%%% -*-BibTeX-*- %%% ==================================================================== %%% BibTeX-file{ %%% author = "Nelson H. F. Beebe", %%% version = "3.202", %%% date = "31 December 2025", %%% time = "07:57:46 MDT", %%% filename = "multithreading.bib", %%% address = "University of Utah %%% Department of Mathematics, 110 LCB %%% 155 S 1400 E RM 233 %%% Salt Lake City, UT 84112-0090 %%% USA", %%% telephone = "+1 801 581 5254", %%% URL = "https://www.math.utah.edu/~beebe", %%% checksum = "42765 61273 291920 2991525", %%% email = "beebe at math.utah.edu, beebe at acm.org, %%% beebe at computer.org (Internet)", %%% codetable = "ISO/ASCII", %%% keywords = "multithreading; OpenMP; POSIX; pthreads; %%% threads; UNIX; Win32; Windows NT", %%% license = "public domain", %%% supported = "no", %%% docstring = "This bibliography covers publications about %%% multithreaded programming. %%% %%% At version 3.202, the year coverage looked %%% like this: %%% %%% 1973 ( 1) 1991 ( 42) 2009 ( 62) %%% 1974 ( 0) 1992 ( 49) 2010 ( 59) %%% 1975 ( 0) 1993 ( 47) 2011 ( 37) %%% 1976 ( 0) 1994 ( 71) 2012 ( 71) %%% 1977 ( 0) 1995 ( 90) 2013 ( 40) %%% 1978 ( 0) 1996 ( 82) 2014 ( 56) %%% 1979 ( 1) 1997 ( 86) 2015 ( 57) %%% 1980 ( 1) 1998 ( 86) 2016 ( 58) %%% 1981 ( 0) 1999 ( 70) 2017 ( 49) %%% 1982 ( 0) 2000 ( 82) 2018 ( 42) %%% 1983 ( 0) 2001 ( 59) 2019 ( 43) %%% 1984 ( 0) 2002 ( 66) 2020 ( 19) %%% 1985 ( 0) 2003 ( 60) 2021 ( 20) %%% 1986 ( 1) 2004 ( 37) 2022 ( 22) %%% 1987 ( 2) 2005 ( 31) 2023 ( 12) %%% 1988 ( 2) 2006 ( 50) 2024 ( 7) %%% 1989 ( 15) 2007 ( 48) 2025 ( 2) %%% 1990 ( 16) 2008 ( 59) %%% 19xx ( 1) %%% %%% Article: 1476 %%% Book: 53 %%% InBook: 1 %%% InCollection: 1 %%% InProceedings: 105 %%% Manual: 4 %%% MastersThesis: 37 %%% Misc: 2 %%% PhdThesis: 22 %%% Proceedings: 69 %%% TechReport: 41 %%% %%% Total entries: 1811 %%% %%% OpenMP is an ``Application Program Interface %%% (API) supports multi-platform shared-memory %%% parallel programming in C/C++ and Fortran on %%% all 
architectures, including Unix platforms %%% and Windows NT platforms. Jointly defined by %%% a group of major computer hardware and %%% software vendors, OpenMP is a portable, %%% scalable model that gives shared-memory %%% parallel programmers a simple and flexible %%% interface for developing parallel %%% applications for platforms ranging from the %%% desktop to the supercomputer.'' [from the %%% OpenMP Web site]. For details, visit %%% %%% http://www.openmp.org/ %%% %%% At least two vendors, Kuck & Associates (KAI), %%% %%% http://www.kai.com/parallel/openmp.html %%% %%% and the Portland Group, Inc. (PGI) %%% %%% http://www.pgroup.com/ppro_docs/pgiws_ug/pgi31u11.htm %%% http://www.pgroup.com/ppro_docs/pgiws_ug/pgi31u12.htm %%% %%% provide extensive support of OpenMP. %%% %%% BibTeX citation tags are uniformly chosen as %%% name:year:abbrev, where name is the family %%% name of the first author or editor, year is a %%% 4-digit number, and abbrev is a 3-letter %%% condensation of important title words. %%% Citation tags were automatically generated by %%% software developed for the BibNet Project. %%% %%% In this bibliography, entries are sorted %%% first by ascending year, and within each %%% year, alphabetically by author or editor, %%% and then, if necessary, by the 3-letter %%% abbreviation at the end of the BibTeX %%% citation tag, using the bibsort -byyear %%% utility. Year order has been chosen to %%% make it easier to identify the most recent %%% work. %%% %%% The checksum field above contains a CRC-16 %%% checksum as the first value, followed by the %%% equivalent of the standard UNIX wc (word %%% count) utility output of lines, words, and %%% characters. 
This is produced by Robert
%%%                        Solovay's checksum utility.",
%%%  }
%%% ====================================================================

%%% NOTE(review): the \emdash guard below was added because the j-SPE
%%% journal string later in this file uses \emdash, which is otherwise
%%% undefined under plain LaTeX; the guard is a no-op when a document
%%% already defines it.
@Preamble{
    "\ifx \undefined \emdash \def \emdash {---} \fi" #
    "\ifx \undefined \pkg \def \pkg #1{{{\tt #1}}} \fi"
}

%%% ====================================================================
%%% Acknowledgement abbreviations:

@String{ack-nhfb = "Nelson H. F. Beebe, University of Utah, Department
                    of Mathematics, 110 LCB, 155 S 1400 E RM 233, Salt
                    Lake City, UT 84112-0090, USA, Tel: +1 801 581 5254,
                    e-mail: \path|beebe@math.utah.edu|,
                    \path|beebe@acm.org|, \path|beebe@computer.org|
                    (Internet), URL:
                    \path|https://www.math.utah.edu/~beebe/|"}

%%% ====================================================================
%%% Institution abbreviations:

@String{inst-ATT-BELL           = "AT\&T Bell Laboratories"}
@String{inst-ATT-BELL:adr       = "Murray Hill, NJ, USA"}

@String{inst-CSC                = "Center for Scientific Computing, Department of Mathematics, University of Utah"}
@String{inst-CSC:adr            = "Salt Lake City, UT 84112, USA"}

@String{inst-CSU                = "Colorado State University"}
@String{inst-CSU:adr            = "Fort Collins, CO, USA"}

@String{inst-NLRC               = "NASA Langley Research Center"}
@String{inst-NLRC:adr           = "Hampton, VA, USA"}

@String{inst-SRC-IDA            = "Supercomputing Research Center: IDA"}
@String{inst-SRC-IDA:adr        = "Lanham, MD, USA"}

@String{inst-U-MARYLAND         = "University of Maryland"}
@String{inst-U-MARYLAND:adr     = "College Park, MD, USA"}

@String{inst-UCB-EECS           = "Department of Electrical Engineering and Computer Science, University of California, Berkeley"}
@String{inst-UCB-EECS:adr       = "Berkeley, CA, USA"}

@String{inst-UIUC-CSRD          = "University of Illinois at Urbana-Champaign, Center for Supercomputing Research and Development"}
@String{inst-UIUC-CSRD:adr      = "Urbana, IL 61801, USA"}

@String{inst-UT-CS              = "Department of Computer Science, University of Tennessee, Knoxville"}
@String{inst-UT-CS:adr          = "Knoxville, TN 37996, USA"}

%%% ====================================================================
%%% Journal abbreviations:
@String{j-ACM-COMM-COMP-ALGEBRA = "ACM Communications in Computer Algebra"} @String{j-ACM-J-EXP-ALGORITHMICS = "ACM Journal of Experimental Algorithmics"} @String{j-ACTA-INFO = "Acta Informatica"} @String{j-ADA-USER = "Ada User"} @String{j-ALGORITHMICA = "Algorithmica"} @String{j-ALGORITHMS-BASEL = "Algorithms ({Basel})"} @String{j-APPL-MATH-COMP = "Applied Mathematics and Computation"} @String{j-APPL-NUM-MATH = "Applied Numerical Mathematics: Transactions of IMACS"} @String{j-BYTE = "Byte Magazine"} @String{j-C-PLUS-PLUS-REPORT = "C++ Report"} @String{j-CACM = "Communications of the ACM"} @String{j-CCCUJ = "C/C++ Users Journal"} @String{j-CCPE = "Concurrency and Computation: Prac\-tice and Experience"} @String{j-CG-WORLD = "Computer Graphics World"} @String{j-COMP-ARCH-NEWS = "ACM SIGARCH Computer Architecture News"} @String{j-COMP-GRAPHICS = "Computer Graphics"} @String{j-COMP-J = "The Computer Journal"} @String{j-COMP-NET-AMSTERDAM = "Computer Networks (Amsterdam, Netherlands: 1999)"} @String{j-COMP-PHYS-COMM = "Computer Physics Communications"} @String{j-COMP-SURV = "ACM Computing Surveys"} @String{j-COMP-SYS = "Computing Systems"} @String{j-COMPUT-MATH-APPL = "Computers and Mathematics with Applications"} @String{j-COMPUT-PHYS = "Computers in Physics"} @String{j-COMPUT-SCI-ENG = "Computing in Science and Engineering"} @String{j-COMPUTER = "Computer"} @String{j-COMPUTERS-AND-GRAPHICS = "Computers and Graphics"} @String{j-COMPUTING = "Computing"} @String{j-CPE = "Concurrency: Prac\-tice and Experience"} @String{j-CUJ = "C Users Journal"} @String{j-DATAMATION = "Datamation"} @String{j-DDJ = "Dr. 
Dobb's Journal of Software Tools"} @String{j-DEC-TECH-J = "Digital Technical Journal"} @String{j-DISTRIB-COMPUT = "Distributed Computing"} @String{j-ELECTRONIK = "Elektronik"} @String{j-FORM-ASP-COMPUT = "Formal Aspects of Computing"} @String{j-FUND-INFO = "Fundamenta Informaticae"} @String{j-FUT-GEN-COMP-SYS = "Future Generation Computer Systems"} @String{j-HIGHER-ORDER-SYMB-COMPUT = "Higher-Order and Symbolic Computation"} @String{j-IBM-JRD = "IBM Journal of Research and Development"} @String{j-IBM-SYS-J = "IBM Systems Journal"} @String{j-IEEE-CGA = "IEEE Computer Graphics and Applications"} @String{j-IEEE-COMPUT-ARCHIT-LETT = "IEEE Computer Architecture Letters"} @String{j-IEEE-COMPUT-SCI-ENG = "IEEE Computational Science \& Engineering"} @String{j-IEEE-CONCURR = "IEEE Concurrency"} @String{j-IEEE-DISTRIB-SYST-ONLINE = "IEEE Distributed Systems Online"} @String{j-IEEE-INT-SYMP-HIGH-PERF-DIST-COMP-PROC = "IEEE International Symposium on High Performance Distributed Computing, Proceedings"} @String{j-IEEE-MICRO = "IEEE Micro"} @String{j-IEEE-PAR-DIST-TECH = "IEEE parallel and distributed technology: systems and applications"} @String{j-IEEE-SOFTWARE = "IEEE Software"} @String{j-IEEE-SPECTRUM = "IEEE Spectrum"} @String{j-IEEE-TRANS-BIG-DATA = "IEEE Transactions on Big Data"} @String{j-IEEE-TRANS-CIRCUITS-SYST-1 = "IEEE Transactions on Circuits and Systems I: Regular Papers"} @String{j-IEEE-TRANS-COMPUT = "IEEE Transactions on Computers"} @String{j-IEEE-TRANS-PAR-DIST-SYS = "IEEE Transactions on Parallel and Distributed Systems"} @String{j-IEEE-TRANS-SOFTW-ENG = "IEEE Transactions on Software Engineering"} @String{j-IEEE-TRANS-VIS-COMPUT-GRAPH = "IEEE Transactions on Visualization and Computer Graphics"} @String{j-IJHPCA = "The International Journal of High Performance Computing Applications"} @String{j-IJQC = "International Journal of Quantum Chemistry"} @String{j-INFO-PROC-LETT = "Information Processing Letters"} @String{j-INT-J-COMP-APPL = "International Journal 
of Computer Applications"} @String{j-INT-J-COMPUT-APPL = "International Journal of Computers and Applications"} @String{j-INT-J-COMPUT-SYST-SCI-ENG = "International Journal of Computer Systems Science and Engineering"} @String{j-INT-J-HIGH-SPEED-COMPUTING = "International Journal of High Speed Computing (IJHSC)"} @String{j-INT-J-PAR-EMER-DIST-SYS = "International Journal of Parallel, Emergent and Distributed Systems: IJPEDS"} @String{j-INT-J-PARALLEL-PROG = "International Journal of Parallel Programming"} @String{j-INT-J-SOFTW-TOOLS-TECHNOL-TRANSFER = "International Journal on Software Tools for Technology Transfer (STTT)"} @String{j-INTEL-TECH-J = "Intel Technology Journal"} @String{j-J-ACM = "Journal of the ACM"} @String{j-J-AUTOM-REASON = "Journal of Automated Reasoning"} @String{j-J-COMP-SECUR = "Journal of Computer Security"} @String{j-J-COMPUT-BIOL = "Journal of Computational Biology"} @String{j-J-COMPUT-CHEM = "Journal of Computational Chemistry"} @String{j-J-COMPUT-PHYS = "Journal of Computational Physics"} @String{j-J-COMPUT-SCI = "Journal of Computational Science"} @String{j-J-GRAPHICS-TOOLS = "Journal of Graphics Tools: JGT"} @String{j-J-GRID-COMP = "Journal of Grid Computing"} @String{j-J-INFO-SEC-APPL = "Journal of Information Security and Applications (JISA)"} @String{j-J-OPEN-SOURCE-SOFT = "Journal of Open Source Software"} @String{j-J-PAR-DIST-COMP = "Journal of Parallel and Distributed Computing"} @String{j-J-STAT-SOFT = "Journal of Statistical Software"} @String{j-J-SUPERCOMPUTING = "The Journal of Supercomputing"} @String{j-J-SYMBOLIC-COMP = "Journal of Symbolic Computation"} @String{j-J-SYST-SOFTW = "The Journal of Systems and Software"} @String{j-J-UCS = "J.UCS: Journal of Universal Computer Science"} @String{j-JAVA-REPORT = "{Java} Report: The Source for {Java} Development"} @String{j-JAVAWORLD = "JavaWorld: IDG's magazine for the Java community"} @String{j-JERIC = "ACM Journal on Educational Resources in Computing (JERIC)"} @String{j-JETC = 
"ACM Journal on Emerging Technologies in Computing Systems (JETC)"} @String{j-LECT-NOTES-COMP-SCI = "Lecture Notes in Computer Science"} @String{j-LINUX-J = "Linux Journal"} @String{j-LOGIN = ";login: the USENIX Association newsletter"} @String{j-MATH-COMPUT-APPL = "Mathematical and Computational Applications"} @String{j-MICROPROC-MICROSYS = "Microprocessors and Microsystems"} @String{j-MICROSOFT-SYS-J = "Microsoft Systems Journal"} @String{j-NORDIC-J-COMPUT = "Nordic Journal of Computing"} @String{j-NUMER-ALGORITHMS = "Numerical Algorithms"} @String{j-ONLINE-CDROM-REV = "Online \& CDROM review: the international journal of online \& optical information systems"} @String{j-OPEN-SYSTEMS-TODAY = "Open Systems Today"} @String{j-OPER-SYS-REV = "Operating Systems Review"} @String{j-PACMPL = "Proceedings of the ACM on Programming Languages (PACMPL)"} @String{j-PARALLEL-COMPUTING = "Parallel Computing"} @String{j-PARALLEL-DIST-COMP-PRACT = "Parallel and Distributed Computing Practices"} @String{j-PARALLEL-PROCESS-LETT = "Parallel Processing Letters"} @String{j-POMACS = "Proceedings of the ACM on Measurement and Analysis of Computing Systems (POMACS)"} @String{j-PROC-REAL-TIME-SYS-SYMP = "Proceedings --- Real-Time Systems Symposium"} @String{j-PROC-VLDB-ENDOWMENT = "Proceedings of the VLDB Endowment"} @String{j-QUEUE = "ACM Queue: Tomorrow's Computing Today"} @String{j-REAL-TIME-SYST = "Real-Time Systems"} @String{j-SCI-COMPUT-PROGRAM = "Science of Computer Programming"} @String{j-SCI-PROG = "Scientific Programming"} @String{j-SCPE = "Scalable Computing: Practice and Experience"} @String{j-SIAM-J-COMPUT = "SIAM Journal on Computing"} @String{j-SIAM-J-SCI-COMP = "SIAM Journal on Scientific Computing"} @String{j-SIGADA-LETTERS = "ACM SIGADA Ada Letters"} @String{j-SIGAPP = "ACM SIGAPP Applied Computing Review"} @String{j-SIGCSE = "SIGCSE Bulletin (ACM Special Interest Group on Computer Science Education)"} @String{j-SIGMETRICS = "ACM SIGMETRICS Performance Evaluation 
Review"}

%%% NOTE(review): j-TODAES had a spurious trailing period, inconsistent
%%% with every other journal string in this file; j-TOCL/j-TOCS were out
%%% of the file's alphabetical order.
@String{j-SIGMICRO              = "ACM SIGMICRO Newsletter"}

@String{j-SIGMOD                = "SIGMOD Record (ACM Special Interest Group on Management of Data)"}

@String{j-SIGPLAN               = "ACM SIG{\-}PLAN Notices"}

@String{j-SIGSOFT               = "ACM SIGSOFT Software Engineering Notes"}

@String{j-SPE                   = "Soft{\-}ware\emdash Prac{\-}tice and Experience"}

@String{j-SUPERCOMPUTER         = "Supercomputer"}

@String{j-TACO                  = "ACM Transactions on Architecture and Code Optimization"}

@String{j-TCBB                  = "IEEE/ACM Transactions on Computational Biology and Bioinformatics"}

@String{j-TECS                  = "ACM Transactions on Embedded Computing Systems"}

@String{j-THEOR-COMP-SCI        = "Theoretical Computer Science"}

@String{j-TISSEC                = "ACM Transactions on Information and System Security"}

@String{j-TIST                  = "ACM Transactions on Intelligent Systems and Technology (TIST)"}

@String{j-TKDD                  = "ACM Transactions on Knowledge Discovery from Data (TKDD)"}

@String{j-TOCHI                 = "ACM Transactions on Computer-Human Interaction"}

@String{j-TOCL                  = "ACM Transactions on Computational Logic"}

@String{j-TOCS                  = "ACM Transactions on Computer Systems"}

@String{j-TODAES                = "ACM Transactions on Design Automation of Electronic Systems"}

@String{j-TODS                  = "ACM Transactions on Database Systems"}

@String{j-TOG                   = "ACM Transactions on Graphics"}

@String{j-TOIS                  = "ACM Transactions on Information Systems"}

@String{j-TOMACS                = "ACM Transactions on Modeling and Computer Simulation"}

@String{j-TOMPECS               = "ACM Transactions on Modeling and Performance Evaluation of Computing Systems (TOMPECS)"}

@String{j-TOMS                  = "ACM Transactions on Mathematical Software"}

@String{j-TOPC                  = "ACM Transactions on Parallel Computing (TOPC)"}

@String{j-TOPLAS                = "ACM Transactions on Programming Languages and Systems"}

@String{j-TOSEM                 = "ACM Transactions on Software Engineering and Methodology"}

@String{j-TRETS                 = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)"}

@String{j-UNIX-REVIEW           = "UNIX review"}

@String{j-UNIXWORLD-OPEN-COMP   = "UnixWorld's Open Computing"}

@String{j-VLDB-J                = "VLDB
Journal: Very Large Data Bases"} @String{j-WEB-TECHNIQUES = "Web Techniques"} @String{j-X-RESOURCE = "{The X Resource}"} %%% ==================================================================== %%% Publisher abbreviations: @String{pub-ACM = "ACM Press"} @String{pub-ACM:adr = "New York, NY 10036, USA"} @String{pub-AP = "Academic Press"} @String{pub-AP:adr = "New York, USA"} @String{pub-APRESS = "Apress"} @String{pub-APRESS:adr = "Berkeley, CA, USA"} @String{pub-AW = "Ad{\-d}i{\-s}on-Wes{\-l}ey"} @String{pub-AW:adr = "Reading, MA, USA"} @String{pub-AWDP = "Ad{\-d}i{\-s}on-Wes{\-l}ey Developers Press"} @String{pub-AWDP:adr = "Reading, MA, USA"} @String{pub-EYROLLES = "Editions Eyrolles"} @String{pub-EYROLLES:adr = "Paris, France"} @String{pub-HERMES = "Hermes"} @String{pub-HERMES:adr = "Paris, France"} @String{pub-IEEE = "IEEE Computer Society Press"} @String{pub-IEEE:adr = "1109 Spring Street, Suite 300, Silver Spring, MD 20910, USA"} @String{pub-KLUWER = "Kluwer Academic Publishers"} @String{pub-KLUWER:adr = "Dordrecht, The Netherlands; Boston, MA, USA"} @String{pub-LEARNED-INF = "Learned Information"} @String{pub-LEARNED-INF:adr = "Medford, NJ, USA"} @String{pub-MCGRAW-HILL = "Mc{\-}Graw-Hill"} @String{pub-MCGRAW-HILL:adr = "New York, NY, USA"} @String{pub-MIT = "MIT Press"} @String{pub-MIT:adr = "Cambridge, MA, USA"} @String{pub-MORGAN-KAUFMANN = "Morgan Kaufmann Publishers"} @String{pub-MORGAN-KAUFMANN:adr = "Los Altos, CA 94022, USA"} @String{pub-MORGAN-KAUFMANN:adrnew = "2929 Campus Drive, Suite 260, San Mateo, CA 94403, USA"} @String{pub-NO-STARCH = "No Starch Press"} @String{pub-NO-STARCH:adr = "San Francisco, CA, USA"} @String{pub-NTIS = "National Technical Information Service"} @String{pub-NTIS:adr = "Washington, DC, USA"} @String{pub-ORA = "O'Reilly \& Associates, Inc."} @String{pub-ORA:adr = "981 Chestnut Street, Newton, MA 02164, USA"} @String{pub-ORA-MEDIA = "O'Reilly Media, Inc."} @String{pub-ORA-MEDIA:adr = "1005 Gravenstein Highway North, Sebastopol, 
CA 95472, USA"}

@String{pub-PACKT               = "Packt Publishing"}
@String{pub-PACKT:adr           = "Birmingham, UK"}

@String{pub-PH                  = "Pren{\-}tice-Hall"}
@String{pub-PH:adr              = "Englewood Cliffs, NJ 07632, USA"}

@String{pub-PHI                 = "Pren{\-}tice-Hall International"}
@String{pub-PHI:adr             = "Englewood Cliffs, NJ 07632, USA"}

@String{pub-PHPTR               = "P T R Pren{\-}tice-Hall"}
@String{pub-PHPTR:adr           = "Englewood Cliffs, NJ 07632, USA"}

@String{pub-SAMS                = "Howard W. Sams"}
@String{pub-SAMS:adr            = "Indianapolis, IN 46268, USA"}

@String{pub-SUN                 = "Sun Microsystems"}
@String{pub-SUN:adr             = "2550 Garcia Avenue, Mountain View, CA 94043, USA"}

@String{pub-SUN-MICROSYSTEMS-PRESS = "Sun Microsystems Press"}
@String{pub-SUN-MICROSYSTEMS-PRESS:adr = "Palo Alto, CA, USA"}

%%% NOTE(review): city name corrected from the misspelling
%%% ``Mountainview'' (compare pub-SUN:adr above).
@String{pub-SUNSOFT             = "SunSoft Press"}
@String{pub-SUNSOFT:adr         = "Mountain View, CA, USA"}

@String{pub-SV                  = "Spring{\-}er-Ver{\-}lag"}
@String{pub-SV:adr              = "Berlin, Germany~/ Heidelberg, Germany~/ London, UK~/ etc."}

@String{pub-UKUUG               = "UK Unix Users Group"}
@String{pub-UKUUG:adr           = "Buntingford, Herts, UK"}

@String{pub-USENIX              = "USENIX Association"}
@String{pub-USENIX:adr          = "Berkeley, CA, USA"}

@String{pub-WILEY               = "John Wiley and Sons"}
@String{pub-WILEY:adr           = "New York, NY, USA; London, UK; Sydney, Australia"}

@String{pub-WORLD-SCI           = "World Scientific Publishing Co."}
@String{pub-WORLD-SCI:adr       = "Singapore; Philadelphia, PA, USA; River Edge, NJ, USA"}

%%% ====================================================================
%%% Series abbreviations:

@String{ser-LNCS                = "Lecture Notes in Computer Science"}

%%% ====================================================================
%%% Bibliography entries, sorted by year, and then by citation label,
%%% with ``bibsort -byyear'':

@Article{Bettcher:1973:TSR,
  author =       "C. W.
Bettcher", title = "Thread standardization and relative cost", journal = j-COMP-ARCH-NEWS, volume = "2", number = "1", pages = "9--9", month = jan, year = "1973", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:28 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", remark = "This is a reprint of an article published in the {\em Journal of the Society of Automotive Engineers}, Volume XVIII, Number 2, p. 131, February 1926, about the cost of the lack of standardization of screw threads. {\em Computer Architecture News\/} Editor-in-Chief Caxton C. Foster has added a hand-written note ``of course, there is no message here for {\em us}.''", } @TechReport{Fraser:1979:CLR, author = "A. G. Fraser", title = "{C} Language Routines for Multi-Thread Computations", type = "Technical Memorandum", number = "1388 (TM 79-1273-4)", institution = inst-ATT-BELL, address = inst-ATT-BELL:adr, pages = "??", day = "7", month = may, year = "1979", bibdate = "Tue Jun 06 08:07:45 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/unix.bib", abstract = "This memorandum describes subroutines which provide the basic mechanisms needed to support multiple tasks within one C language program.", acknowledgement = ack-nhfb, author-dates = "Alexander G. (Sandy) Fraser (8 June 1937--13 June 2022)", } @Article{Smith:1980:ASD, author = "Connie Smith and J. C. 
Browne", title = "Aspects of software design analysis: {Concurrency} and blocking", journal = j-SIGMETRICS, volume = "9", number = "2", pages = "245--253", month = "Summer", year = "1980", CODEN = "????", DOI = "https://doi.org/10.1145/1009375.806169", ISSN = "0163-5999 (print), 1557-9484 (electronic)", ISSN-L = "0163-5999", bibdate = "Thu Jun 26 10:54:53 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "This paper extends previous work on development of a methodology for the prediction of the performance of computer software systems from design level specifications and continuing through implementation. The effects of synchronized behavior, such as results from data reservation in multi-thread executions of data base systems, and competition for host system resources are incorporated. The previous methodology uses hierarchical graphs to represent the execution of software on some host computer system (or on some abstract machine). Performance metrics such as response time were obtained from analysis of these graphs assuming execution of a single copy on a dedicated host. This paper discusses the mapping of these execution graphs upon queueing network models of the host computing environment to yield performance metric estimates for more complex and realistic processing environments.", acknowledgement = ack-nhfb, fjournal = "ACM SIGMETRICS Performance Evaluation Review", journal-URL = "http://portal.acm.org/toc.cfm?id=J618", } @Article{Jonak:1986:EFL, author = "J. E. 
Jonak", title = "Experience with a {FORTH}-like language", journal = j-SIGPLAN, volume = "21", number = "2", pages = "27--36", month = feb, year = "1986", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:14:55 MST 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classification = "C6110 (Systems analysis and programming); C6140D (High level languages)", corpsource = "Sperry Network Syst., London, UK", fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "FORTH; languages; programming; threaded code language", pubcountry = "USA A03", subject = "D.3.2 Software, PROGRAMMING LANGUAGES, Language Classifications, FORTH", treatment = "P Practical", } @Book{McJones:1987:EUS, author = "Paul R. McJones and Garret Frederick Swart", title = "Evolving the {UNIX} system interface to support multithreaded programs: The {Topaz Operating System} programmer's manual", volume = "21", publisher = "Digital Systems Research Center", address = "Palo Alto, CA, USA", pages = "100", day = "28", month = sep, year = "1987", LCCN = "QA76.76.O63M42 1987", bibdate = "Fri Aug 7 08:29:38 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", series = "Systems Research Center", acknowledgement = ack-nhfb, keywords = "computer networks; Computer networks; electronic data processing -- distributed processing; Electronic data processing -- Distributed processing; multithreaded operating system interface -- Topaz operating; Operating systems (Computers); operating systems (computers); system; UNIX (computer file); UNIX (Computer operating system)", } @Article{Tanner:1987:MTI, author = "P. P. 
Tanner", title = "Multi-thread input", journal = j-COMP-GRAPHICS, volume = "21", number = "2", pages = "142--145", month = apr, year = "1987", CODEN = "CGRADI, CPGPBZ", ISSN = "0097-8930 (print), 1558-4569 (electronic)", ISSN-L = "0097-8930", bibdate = "Tue Mar 12 17:52:38 MST 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Computer Graphics", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J166", } @Article{Gilbert:1988:DVN, author = "P. D. Gilbert", title = "Development of the {VAX NOTES} system", journal = j-DEC-TECH-J, volume = "1", number = "6", pages = "117--124", month = feb, year = "1988", CODEN = "DTJOEL", ISSN = "0898-901X", bibdate = "Thu Mar 20 18:15:43 MST 1997", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classcodes = "C6110B (Software engineering techniques); C7410F (Communications)", corpsource = "Digital Equipment Corp., Hudson, MA, USA", fjournal = "Digital Technical Journal", keywords = "callable interface; communications tool; computer conferencing; DEC; DEC computers; discussions; human factors; human-factors engineering; interfaces; medium; multiprogramming; multitasking; multithreaded server; online; program; program testing; software engineering; storage; technical writer; teleconferencing; testing; user; user interface; VAX NOTES", treatment = "P Practical", } @Article{Halstead:1988:MMP, author = "R. H. {Halstead, Jr.} and T. 
Fujita", title = "{MASA}: a multithreaded processor architecture for parallel symbolic computing", journal = j-COMP-ARCH-NEWS, volume = "16", number = "2", pages = "443--451", month = may, year = "1988", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @TechReport{Agarwal:1989:PTM, author = "Anant Agarwal", title = "Performance tradeoffs in multithreaded processors", number = "89-566", institution = "Massachusetts Institute of Technology, Microsystems Program Office", address = "Cambridge, MA, USA", pages = "30", year = "1989", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", series = "VLSI memo", acknowledgement = ack-nhfb, } @Article{Amamiya:1989:DFC, author = "M. Amamiya", title = "Data Flow Computing and Parallel Reduction Machine", journal = j-FUT-GEN-COMP-SYS, volume = "4", number = "??", pages = "53--67", month = "????", year = "1989", CODEN = "FGSEVI", ISSN = "0167-739X (print), 1872-7115 (electronic)", ISSN-L = "0167-739X", bibdate = "Wed Feb 27 18:37:19 2002", bibsource = "ftp://ftp.ira.uka.de/bibliography/Compiler/Functional.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", fjournal = "Future Generation Computer Systems", journal-URL = "http://www.sciencedirect.com/science/journal/0167739X", keywords = "functional cell toke flow multi-thread control flow architecture", } @TechReport{Birrell:1989:IPT, author = "Andrew D. 
Birrell", title = "An introduction to programming with threads", type = "SRC reports", number = "35", institution = "Digital Systems Research Center", address = "Palo Alto, CA, USA", pages = "35", day = "6", month = jan, year = "1989", LCCN = "QA76.6.B5729 1989", bibdate = "Fri May 10 12:18:17 MDT 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "parallel programming (computer science); synchronization", } @Article{Briot:1989:OAS, author = "Jean-Pierre Briot", title = "From objects to actors: study of a limited symbiosis in {Smalltalk-80}", journal = j-SIGPLAN, volume = "24", number = "4", pages = "69--72", month = apr, year = "1989", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:15:37 MST 2003", bibsource = "Compendex database; http://portal.acm.org/; http://www.acm.org/pubs/toc/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/proceedings/plan/67386/p69-briot/", abstract = "In this paper we describe an implementation of actors in Smalltalk-80, named Actalk. This attempt is designed as a minimal extension preserving the Smalltalk-80 language. Actors are active and autonomous objects, as opposed to standard passive Smalltalk-80 objects. An actor is built from a standard Smalltalk-80 object by associating a process with it and by serializing the messages it could receive into a queue. We will study the cohabitation and synergy between the two models of computations: transfer of active messages (message and thread of activity) between passive objects, and exchange of passive messages between active objects. 
We propose a sketch of methodology in order to have a safe combination between these two programming paradigms.", acknowledgement = ack-nhfb, affiliation = "Univ Paris VI", affiliationaddress = "Paris, Fr", classification = "723", conference = "Proceedings of the ACM SIGPLAN Workshop on Object-Based Concurrent Programming", confname = "Proceedings of the ACM SIGPLAN workshop on Object-based concurrent programming, September 26--27 1988, San Diego, CA", fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", journalabr = "SIGPLAN Not", keywords = "Actor Based Systems; Computer Metatheory--Programming Theory; Computer Programming Languages; Concurrent Programming; Design; design; languages; Object-Based Programming; Smalltalk-80", meetingaddress = "San Diego, CA, USA", meetingdate = "Sep 26--27 1988", meetingdate2 = "09/26--27/88", subject = "{\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language Classifications, Smalltalk-80. {\bf D.1.3} Software, PROGRAMMING TECHNIQUES, Concurrent Programming. {\bf D.4.1} Software, OPERATING SYSTEMS, Process Management, Concurrency.", } @Article{Caromel:1989:GMC, author = "Denis Caromel", title = "A general model for concurrent and distributed object-oriented programming", journal = j-SIGPLAN, volume = "24", number = "4", pages = "102--104", month = apr, year = "1989", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:15:37 MST 2003", bibsource = "Compendex database; http://portal.acm.org/; http://www.acm.org/pubs/toc/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/proceedings/plan/67386/p102-caromel/", abstract = "This paper presents a general model supporting object-oriented programming in concurrent as well as distributed environments. The model combines the advantages of remote procedure calls with those of message passing. 
It relies on the following concepts: All objects are not active but the active entities are objects, Asynchronous Message Passing with Data-driven synchronization, and Service mechanism allowing an explicit thread of control.", acknowledgement = ack-nhfb, affiliation = "CNRS", affiliationaddress = "Vandoeuvres-les-Nancy, Fr", classification = "722; 723", conference = "Proceedings of the ACM SIGPLAN Workshop on Object-Based Concurrent Programming", confname = "Proceedings of the ACM SIGPLAN workshop on Object-based concurrent programming, September 26--27 1988, San Diego, CA", fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", journalabr = "SIGPLAN Not", keywords = "Computer Systems Programming; Computer Systems, Digital--Distributed; Concurrent Programming; design; Multiprocessing Programs; Object-Oriented Programming", meetingaddress = "San Diego, CA, USA", meetingdate = "Sep 26--27 1988", meetingdate2 = "09/26--27/88", subject = "{\bf D.1.3} Software, PROGRAMMING TECHNIQUES, Concurrent Programming. {\bf D.1.m} Software, PROGRAMMING TECHNIQUES, Miscellaneous. {\bf D.4.7} Software, OPERATING SYSTEMS, Organization and Design, Distributed systems. {\bf D.4.1} Software, OPERATING SYSTEMS, Process Management, Concurrency.", } @MastersThesis{CarrerasVaquer:1989:APE, author = "Carlos {Carreras Vaquer}", title = "Architecture and performance evaluation of a multithreaded cache design", type = "Thesis ({M.S. in Engineering})", school = "University of Texas at Austin", address = "Austin, TX, USA", pages = "xii + 108", year = "1989", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Cache memory; Computer architecture; Computer storage devices; Integrated circuits -- Very large scale integration; Microprocessors", } @TechReport{Caswell:1989:IMD, author = "Deborah L. Caswell and David L. 
Black", title = "Implementing a {Mach} debugger for multithreaded applications", type = "Research paper", number = "CMU-CS-89-154", institution = "Carnegie Mellon University, Computer Science Dept.", address = "Pittsburgh, PA, USA", pages = "13", month = nov, year = "1989", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "To appear in the Conference Proceedings of Winter 1990 USENIX Technical Conference and Exhibition, Washington, DC, January, 1990.", abstract = "Multiple threads of control add new challenges to the task of application debugging, and require the development of new debuggers to meet these challenges. This paper describes the design and implementation of modifications to an existing debugger (gdb) for debugging multithreaded applications under the Mach operating system. It also describes the operating system facilities that support it. Although certain implementation details are specific to Mach, the underlying design principles are applicable to other systems that support threads in a Unix compatible environment.", acknowledgement = ack-nhfb, annote = "Supported by the Space and Naval Warfare Systems Command.", keywords = "Debugging in computer science -- Computer programs", } @InProceedings{Korty:1989:SLL, author = "Joseph A. Korty", title = "{Sema}: a {Lint-like} Tool for Analyzing Semaphore Usage in a Multithreaded {UNIX} Kernel", crossref = "USENIX:1989:PWU", institution = "MODCOMP", pages = "113--123", month = "Winter", year = "1989", bibdate = "Wed Aug 13 10:48:45 MDT 1997", bibsource = "ftp://ftp.uu.net/library/bibliography; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/usenix1980.bib; http://www.usenix.org/cgi-bin/sortbib.pl?-sA", acknowledgement = ack-nhfb, affiliation = "MODCOMP", } @Article{Massalin:1989:TIO, author = "H. Massalin and C. 
Pu", title = "Threads and input\slash output in the synthesis kernel", journal = j-OPER-SYS-REV, volume = "23", number = "5", pages = "191--201", month = dec, year = "1989", CODEN = "OSRED8", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 12:47:29 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @InProceedings{McJones:1989:EUS, author = "Paul R. McJones and Garret F. Swart", title = "Evolving the {UNIX} System Interface to Support Multithreaded Programs", crossref = "USENIX:1989:PWU", pages = "393--404", month = "Winter", year = "1989", bibdate = "Fri Oct 18 07:24:24 MDT 1996", bibsource = "ftp://ftp.uu.net/library/bibliography; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, affiliation = "DEC Systems Research Center", } @MastersThesis{Plyler:1989:AMC, author = "Kevin Brian Plyler", title = "Adding multithreaded capabilities to the process manager of the {BIGSAM} distributed operating system", type = "Thesis ({M.S.})", school = "Arizona State University", address = "Tempe, AZ, USA", pages = "x + 105 + 2", year = "1989", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Electronic data processing -- Distributed processing; Multiprocessors; Operating systems (Computers)", } @InProceedings{Rashid:1989:MFO, author = "R. Rashid and R. Baron and A. Forin and D. Golub and M. Jones and D. Orr and R. Sanzi", title = "{Mach}: a foundation for open systems (operating systems)", crossref = "IEEE:1989:WOS", pages = "109--113", year = "1989", bibdate = "Sat Sep 28 20:21:01 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/mach.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, affiliation = "Sch. of Comput. 
Sci., Carnegie-Mellon Univ., Pittsburgh, PA, USA", classification = "C6110B (Software engineering techniques); C6150J (Operating systems)", keywords = "Hardware resources; Mach kernel; Multiserver Unix; Multithreaded Unix server; Operating system; OS emulation; Software development", thesaurus = "File servers; Open systems; Operating systems [computers]; Software engineering; Unix", } @Article{Schonberg:1989:FDA, author = "Edith Schonberg", title = "On-the-fly detection of access anomalies", journal = j-SIGPLAN, volume = "24", number = "7", pages = "285--297", month = jul, year = "1989", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:15:41 MST 2003", bibsource = "http://www.acm.org/pubs/contents/proceedings/pldi/73141/index.html; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/proceedings/pldi/73141/p285-schonberg/", abstract = "Access anomalies are a common class of bugs in shared-memory parallel programs. An access anomaly occurs when two concurrent execution threads both write (or one thread reads and the other writes) the same shared memory location without coordination. Approaches to the detection of access anomalies include static analysis, post-mortem trace analysis, and on-the-fly monitoring. A general on-the-fly algorithm for access anomaly detection is presented, which can be applied to programs with both nested fork-join and synchronization operations. The advantage of on-the-fly detection over post-mortem analysis is that the amount of storage used can be greatly reduced by data compression techniques and by discarding information as soon as it becomes obsolete. In the algorithm presented, the amount of storage required at any time depends only on the number V of shared variables being monitored and the number N of threads, not on the number of synchronizations. 
Data compression is achieved by the use of two techniques called merging and subtraction. Upper bounds on storage are shown to be $V \times N^2$ for merging and $V \times N$ for subtraction.", acknowledgement = ack-nhfb, affiliationaddress = "New York, NY, USA", annote = "Published as part of the Proceedings of PLDI'89.", classification = "722; 723", conference = "Proceedings of the SIGPLAN '89 Conference on Programming Language Design and Implementation", fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", journalabr = "SIGPLAN Not", keywords = "Access Anomalies; algorithms; Computer Operating Systems; Computer Programming Languages--Design; Computer Systems, Digital--Parallel Processing; languages; Parallel Programs; Program Processors", meetingaddress = "Portland, OR, USA", meetingdate = "Jun 21--23 1989", meetingdate2 = "06/21--23/89", sponsor = "ACM, Special Interest Group on Programming Languages, New York; SS NY, USA", subject = "{\bf D.1.3} Software, PROGRAMMING TECHNIQUES, Concurrent Programming. {\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language Classifications, Ada. {\bf D.2.2} Software, SOFTWARE ENGINEERING, Design Tools and Techniques, Flow charts.", } @InProceedings{Caswell:1990:IMD, author = "D. Caswell and D. 
Black", title = "Implementing a {Mach} debugger for multithreaded applications", crossref = "Anonymous:1990:PWU", pages = "25--39", year = "1990", bibdate = "Sat Sep 28 20:03:34 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, affiliation = "Hewlett Packard Labs., Palo Alto, CA, USA", classification = "C6150G (Diagnostic, testing, debugging and evaluating systems); C6150J (Operating systems)", keywords = "Application debugging; Mach debugger; Mach operating system; Multithreaded applications; Operating system facilities; Underlying design principles; Unix compatible environment", thesaurus = "Operating systems [computers]; Program debugging; Unix", } @Article{Colvin:1990:CTS, author = "Gregory Colvin", title = "{CUG306} Thread and Synapsys", journal = j-CUJ, volume = "8", type = "CUG New Release", number = "3", pages = "131--??", month = mar, year = "1990", ISSN = "0898-9788", bibdate = "Fri Aug 30 16:52:23 MDT 1996", bibsource = "http://www.cuj.com/cbklist.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "C Users Journal", } @Article{Colvin:1990:MLT, author = "Gregory Colvin", title = "Multitasking With Lightweight Threads", journal = j-CUJ, volume = "8", number = "3", pages = "55--??", month = mar, year = "1990", ISSN = "0898-9788", bibdate = "Fri Aug 30 16:52:23 MDT 1996", bibsource = "http://www.cuj.com/cbklist.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "C Users Journal", } @Article{Eggers:1990:TEI, author = "S. J. Eggers and David R. Keppel and Eric J. Koldinger and Henry M. 
Levy", title = "Techniques for efficient inline tracing on a shared-memory multiprocessor", journal = j-SIGMETRICS, volume = "18", number = "1", pages = "37--47", month = may, year = "1990", CODEN = "????", DOI = "https://doi.org/10.1145/98457.98501", ISSN = "0163-5999 (print), 1557-9484 (electronic)", ISSN-L = "0163-5999", bibdate = "Thu Jun 26 11:09:08 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "While much current research concerns multiprocessor design, few traces of parallel programs are available for analyzing the effect of design trade-offs. Existing trace collection methods have serious drawbacks: trap-driven methods often slow down program execution by more than 1000 times, significantly perturbing program behavior; microcode modification is faster, but the technique is neither general nor portable. This paper describes a new tool, called MPTRACE, for collecting traces of multithreaded parallel programs executing on shared-memory multiprocessors. MPTRACE requires no hardware or microcode modification; it collects complete program traces; it is portable; and it reduces execution-time dilation to less than a factor 3. MPTRACE is based on inline tracing, in which a program is automatically modified to produce trace information as it executes. We show how the use of compiler flow analysis techniques can reduce the amount of data collected and therefore the runtime dilation of the traced program. We also discuss problematic issues concerning buffering and writing of trace data on a multiprocessor.", acknowledgement = ack-nhfb, fjournal = "ACM SIGMETRICS Performance Evaluation Review", journal-URL = "http://portal.acm.org/toc.cfm?id=J618", } @Article{Faust:1990:POO, author = "John E. Faust and Henry M. 
Levy", title = "The performance of an object-oriented threads package", journal = j-SIGPLAN, volume = "25", number = "10", pages = "278--288", month = oct, year = "1990", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:15:57 MST 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Gonzalez:1990:MSC, author = "Dean W. Gonzalez", title = "Multitasking Software Components", journal = j-SIGADA-LETTERS, volume = "10", number = "1", pages = "92--96", month = jan # "\slash " # feb, year = "1990", CODEN = "AALEE5", ISSN = "1094-3641 (print), 1557-9476 (electronic)", ISSN-L = "1094-3641", bibdate = "Thu Sep 28 07:33:23 MDT 2000", bibsource = "ftp://ftp.uu.net/library/bibliography; http://www.adahome.com/Resources/Bibliography/articles.ref; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classcodes = "C6110B (Software engineering techniques); C6120 (File organisation)", fjournal = "ACM SIGADA Ada Letters", keywords = "Ada; Ada parameter passing semantics; concurrency, tasking, reuse; concurrent forms; data integrity; data structure manipulation routines; data structures; multiple; parallel programming; reusability; semaphore calls; software; threads of control", treatment = "P Practical", } @InProceedings{Hansen:1990:EPA, author = "G. J. Hansen and C. A. Linthicum and G. 
Brooks", title = "Experience with a performance analyzer for multithreaded applications", crossref = "IEEE:1990:PSN", pages = "124--131", year = "1990", bibdate = "Wed Apr 15 18:34:48 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classification = "C5470 (Performance evaluation and testing); C6150E (General utility programs); C6150G (Diagnostic, testing, debugging and evaluating systems)", corpsource = "CONVEX Comput. Corp., Richardson, TX, USA", keywords = "CONVEX C200 series computers; Convex OS V8.0; CONVEX performance analyzer, CX/sub pa/; loops; multiprocessing systems; multithreaded applications; operating system facilities; parallel code monitoring; performance evaluation; profiling data; profiling information; time-sharing environment; time-sharing systems; Unix; UNIX based operating system", sponsororg = "IEEE; ACM; Lawrence Livermore Nat. Lab.; Los Alamos Nat. Lab.; NASA Ames Res. Center; Nat. Center Atmos. Res.; NSF; SIAM; Supercomput. Res. Center", treatment = "P Practical; X Experimental", } @Article{Miastkowski:1990:PGG, author = "Stan Miastkowski", title = "{PC GUIs} Go Head to Head", journal = j-BYTE, volume = "15", number = "11", pages = "82--87", month = "Fall", year = "1990", CODEN = "BYTEDJ", ISSN = "0360-5280 (print), 1082-7838 (electronic)", ISSN-L = "0360-5280", bibdate = "Thu Sep 12 18:39:30 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classification = "C6130B (Graphics techniques); C6150J (Operating systems); C6180 (User interfaces)", fjournal = "BYTE Magazine", keywords = "Graphical DOS shell; Multithreading operating system; OS/2; PC GUIs; User interface differences; Windows 3.0", thesaurus = "Computer graphics; Operating systems [computers]; User interfaces", } @Article{Nordstrom:1990:TL, author = "D. J. 
Nordstrom", title = "Threading {Lisp}", journal = j-SIGPLAN, volume = "25", number = "2", pages = "17--24", month = feb, year = "1990", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:15:50 MST 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @TechReport{Polychronopoulos:1990:ASC, author = "C. D. (Constantine D.) Polychronopoulos", title = "Auto scheduling: control flow and data flow come together", type = "Technical Report", number = "CSRD 1058", institution = inst-UIUC-CSRD, address = inst-UIUC-CSRD:adr, pages = "28", month = dec, year = "1990", bibdate = "Fri Aug 30 08:01:51 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "This paper presents a framework we term auto-scheduling, which brings together the control flow and data flow models by combining most of the advantages and excluding the major disadvantages of the two familiar models. Auto-scheduling can be viewed either as an abstract architectural model or as a parallel program compilation framework. While in ordinary environments parallel task creation and scheduling is done by the operating system, or at best the run-time library, in auto-scheduling task creation and scheduling is performed by the user program itself, making parallel processing affordable at fine-granularity levels. Under auto-scheduling the compiler does not only generate object code, but it `lends' its knowledge about a program to the parallel instruction threads of that program, allowing them to manage, activate, and schedule themselves at run-time, without the need of an external monitor. This is done by means of special drive-code injected by the compiler to each schedulable unit of a program (task, thread, etc). 
We argue that auto-scheduling offers an optimal approach for exploiting parallelism on real parallel computer systems.", acknowledgement = ack-nhfb, annote = "Title on P. 1: Auto-scheduling: control flow and data flow come together. Supported in part by the National Science Foundation. Supported in part by the U.S. Department of Energy. Supported in part by Digital Equipment Corporation.", keywords = "Parallel processing (Electronic computers); Scheduling (Management)", } @InProceedings{Presotto:1990:MSP, author = "D. L. Presotto", booktitle = "UKUUG. UNIX - The Legend Evolves. Proceedings of the Summer 1990 UKUUG Conference", title = "Multiprocessor Streams for {Plan 9}", publisher = pub-UKUUG, address = pub-UKUUG:adr, pages = "11--19 (of xi + 260)", month = "????", year = "1990", ISBN = "0-9513181-7-9", ISBN-13 = "978-0-9513181-7-1", LCCN = "????", bibdate = "Sat Mar 22 15:10:17 MST 1997", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classcodes = "C6150J (Operating systems)", conflocation = "London, UK; 9-13 July 1990", corpsource = "AT\&T Bell Lab., Murray Hill, NJ, USA", keywords = "abstraction; input-output programs; kernel; multi-threaded; multiprocessing programs; multiprocessor; Plan 9 kernel; Streams; system call interface; Unix", treatment = "P Practical", } @TechReport{Saavedra-Barrera:1990:AMA, author = "Rafael H. Saavedra-Barrera and David E. 
Culler and Thorsten von Eicken", title = "Analysis of multithreaded architectures for parallel computing", type = "Report", number = "UCB/CSD 90/569", institution = "University of California, Berkeley, Computer Science Division", address = "Berkeley, CA, USA", pages = "10", month = apr, year = "1990", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "To appear in the 2nd Annual ACM Symposium on Parallel Algorithms and Architectures, Crete, Greece, July 1990.", abstract = "Multithreading has been proposed as an architectural strategy for tolerating latency in multiprocessors and, through limited empirical studies, shown to offer promise. This paper develops an analytical model of multithreaded processor behavior based on a small set of architectural and program parameters. The model gives rise to a large Markov chain, which is solved to obtain a formula for processor efficiency in terms of the number of threads per processor, the remote reference rate, the latency, and the cost of switching between threads. It is shown that a multithreaded processor exhibits three operating regimes: linear (efficiency is proportional to the number of threads), transition, and saturation (efficiency depends only on the remote reference rate and switch cost). Formulae for regime boundaries are derived. The model is embellished to reflect cache degradation due to multithreading, using an analytical model of cache behavior, demonstrating that returns diminish as the number of threads becomes large. Predictions from the embellished model correlate well with published empirical measurements. Prescriptive use of the model under various scenarios indicates that multithreading is effective, but the number of useful threads per processor is fairly small.", acknowledgement = ack-nhfb, annote = "Supported in part by NASA. 
Supported in part by the National Science Foundation through the UCB Mammoth project.", keywords = "Computer architecture; Multiprocessors", } @Article{Schmitt:1990:CEM, author = "David A. Schmitt", title = "{C} Extensions For Multi-Threading", journal = j-CUJ, volume = "8", number = "8", pages = "33--??", month = aug, year = "1990", ISSN = "0898-9788", bibdate = "Fri Aug 30 16:52:23 MDT 1996", bibsource = "http://www.cuj.com/cbklist.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "C Users Journal", } @MastersThesis{Stapleton:1990:DSS, author = "Joseph Francis Stapleton", title = "Dynamic server selection in a multithreaded network computing environment", type = "Thesis ({M.S.})", school = "Iowa State University", address = "Ames, IA, USA", pages = "66", year = "1990", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @TechReport{Agarwal:1991:PTM, author = "Anant Agarwal", title = "Performance tradeoffs in multithreaded processors", type = "Technical report", number = "MIT/LCS/TR 501; VLSI memo no. 89-566", institution = "Laboratory for Computer Science, Massachusetts Institute of Technology", address = "Cambridge, MA, USA", pages = "39", year = "1991", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Balter:1991:AIG, author = "R. Balter and J. Bernadat and D. Decouchant and A. Duda and A. Freyssinet and S. Krakowiak and M. Meysembourg and P. Le Dot and H. Nguyen Van and E. Paire and M. Riveill and C. Roison and X. Rousset de Pina and R. Scioville and G. 
Vand{\^o}me", title = "Architecture and Implementation of Guide, an Object-Oriented Distributed System", journal = j-COMP-SYS, volume = "4", number = "1", pages = "31--67", month = "Winter", year = "1991", CODEN = "CMSYE2", ISSN = "0895-6340", bibdate = "Fri Sep 13 08:51:08 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", acknowledgement = ack-nhfb, classification = "C6110 (Systems analysis and programming); C6150J (Operating systems)", fjournal = "Computing Systems", keywords = "Class; Distributed object memory; Dynamic links; Execution structures; Execution units; Grenoble Universities integrated distributed environment; Guide; Job sharing; Language; Multi-threaded virtual machines; Nodes; Object model; Object-oriented distributed operating system; Persistent objects storage; Single inheritance; Synchronized objects; Synchronized transactions; Type; UNIX", thesaurus = "Distributed processing; Object-oriented programming; Operating systems [computers]", } @Article{Beddow:1991:MTC, author = "A. J. M. Beddow", title = "Multi-Threaded {C} Functions", journal = j-CUJ, volume = "9", number = "1", pages = "57--??", month = jan, year = "1991", ISSN = "0898-9788", bibdate = "Fri Aug 30 16:52:23 MDT 1996", bibsource = "http://www.cuj.com/cbklist.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "C Users Journal", } @InProceedings{Bolinger:1991:PSH, author = "D. Bolinger and S. 
Mangalat", title = "Parallelizing signal handling and process management in {OSF/1}", crossref = "USENIX:1991:PUM", pages = "105--122", year = "1991", bibdate = "Sat Sep 28 19:47:51 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/mach.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, affiliation = "Encore Computer Corp., Marlborough, MA, USA", classification = "C6110P (Parallel programming); C6150J (Operating systems)", keywords = "Mach kernel; Multi-threaded programming model; Multi-threaded tasks; Multiprocessor-efficient; OSF/1 operating system; Parallelization; Performance improvements; Process management; Races; Signal handling; Synchronization problems; System calls; Unix emulation; Unix process-oriented abstractions", thesaurus = "Interrupts; Operating systems [computers]; Parallel programming; Unix", } @Article{Canetti:1991:PCP, author = "R. Canetti and L. P. Fertig and S. A. Kravitz and D. Malki and R. Y. Pinter and S. Porat and A. Teperman", title = "The parallel {C} ({pC}) programming language", journal = j-IBM-JRD, volume = "35", number = "5/6", pages = "727--741", month = sep # "\slash " # nov, year = "1991", CODEN = "IBMJAE", ISSN = "0018-8646 (print), 2151-8556 (electronic)", ISSN-L = "0018-8646", bibdate = "Tue Mar 25 14:26:59 MST 1997", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The authors describe pC (parallel C), an extension of the ANSI C programming language to support medium- to large-grain parallel programming in both shared- and distributed-memory environments. pC aims to make programming for parallel processors accessible to the C community by enriching the C programming model with a small set of constructs supporting parallelism. pC supports shared- and distributed-memory environments via a hierarchical computational model. A pC application comprises a static collection of tasks with disjoint memory spaces. 
A dynamic collection of threads runs within each task, sharing the data and code of the task. Language constructs specify concurrent execution of threads within a single task. Additional language constructs specify the interactions between threads through the following mechanisms: initiation of threads in remote tasks by remote function call, mailbox-based message passing, and synchronization primitives. The paper introduces the computational model and language constructs of pC and describes a prototype pC compiler and run-time system for the Mach operating system. Several program examples illustrate the utility of pC constructs.", acknowledgement = ack-nhfb, affiliation = "Dept. of Comput. Sci., Technion-Israel Inst. of Technol., Haifa, Israel", classcodes = "C6140D (High level languages); C6110P (Parallel programming); C6150C (Compilers, interpreters and other processors)", classification = "C6110P (Parallel programming); C6140D (High level languages); C6150C (Compilers, interpreters and other processors)", corpsource = "Dept. of Comput. Sci., Technion-Israel Inst. 
of Technol., Haifa, Israel", fjournal = "IBM Journal of Research and Development", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520", keywords = "ANSI C programming language; C language; C programming; C programming model; Disjoint memory spaces; disjoint memory spaces; Distributed-memory; distributed-memory; function call; Hierarchical computational model; hierarchical computational model; Language constructs; language constructs; Mach; Mach operating system; Mailbox-based message passing; mailbox-based message passing; model; operating system; Parallel C; parallel C; parallel languages; Parallel programming; parallel programming; Parallelism; parallelism; PC; pC; PC compiler; pC compiler; program compilers; remote; Remote function call; Run-time system; run-time system; Shared memory; shared memory; Synchronization; synchronization; Tasks; tasks; Threads; threads", thesaurus = "C language; Parallel languages; Program compilers", treatment = "P Practical", } @Article{Ching:1991:EAP, author = "W.-M. Ching and D. Ju", title = "Execution of automatically parallelized {APL} programs on {RP3}", journal = j-IBM-JRD, volume = "35", number = "5/6", pages = "767--777", month = sep # "\slash " # nov, year = "1991", CODEN = "IBMJAE", ISSN = "0018-8646 (print), 2151-8556 (electronic)", ISSN-L = "0018-8646", bibdate = "Tue Mar 25 14:26:59 MST 1997", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The authors have implemented an experimental APL/C compiler, which accepts ordinary APL programs and produces C programs. They have also implemented a run-time environment that supports the parallel execution of these C programs on the RP3 computer, a shared-memory, 64-way MIMD machine built at the IBM Thomas J. Watson Research Center. The APL/C compiler uses the front end of the APL/370 compiler and imposes the same restrictions, but requires no parallelization directives from the user. 
The run-time environment is based on simple synchronization primitives and is implemented using Mach threads. They report the speedups of several compiled programs running on RP3 under the Mach operating system. The current implementation exploits only data parallelism. They discuss the relationship between the style of an APL program and its expected benefit from the automatic parallel execution provided by the compiler.", acknowledgement = ack-nhfb, affiliation = "IBM Thomas J. Watson Res. Center, Yorktown Heights, NY, USA", classcodes = "C6150C (Compilers, interpreters and other processors); C6150N (Distributed systems); C6140D (High level languages)", classification = "C6140D (High level languages); C6150C (Compilers, interpreters and other processors); C6150N (Distributed systems)", corpsource = "IBM Thomas J. Watson Res. Center, Yorktown Heights, NY, USA", fjournal = "IBM Journal of Research and Development", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520", keywords = "APL; APL/370 compiler; APL/C; APL/C compiler; Automatically parallelized APL programs; automatically parallelized APL programs; C language; C programs; compiler; compilers; Data parallelism; data parallelism; Mach operating; Mach operating system; Mach threads; multiprocessing programs; program; RP3; Shared-memory; shared-memory; synchronisation; Synchronization primitives; synchronization primitives; system", thesaurus = "APL; C language; Multiprocessing programs; Program compilers; Synchronisation", treatment = "P Practical", } @Article{Chiueh:1991:MTV, author = "Tzi-cker Chiueh", title = "Multi-threaded vectorization", journal = j-COMP-ARCH-NEWS, volume = "19", number = "3", pages = "352--361", month = may, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal 
= "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Culler:1991:FGPa, author = "David E. Culler and Anurag Sah and Klaus E. Schauser and Thorsten von Eicken and John Wawrzynek", title = "Fine-grain parallelism with minimal hardware support: a compiler-controlled threaded abstract machine", journal = j-COMP-ARCH-NEWS, volume = "19", number = "2", pages = "164--175", month = apr, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Culler:1991:FGPb, author = "David E. Culler and Anurag Sah and Klaus E. Schauser and Thorsten von Eicken and John Wawrzynek", title = "Fine-Grain Parallelism with Minimal Hardware Support: a Compiler-Controlled Threaded Abstract Machine", journal = j-SIGPLAN, volume = "26", number = "4", pages = "164--175", month = apr, year = "1991", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat May 01 18:50:04 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Culler:1991:FGPc, author = "David E. Culler and Anurag Sah and Klaus E. 
Schauser and Thorsten von Eicken and John Wawrzynek", title = "Fine-grain parallelism with minimal hardware support: a compiler-controlled threaded abstract machine", journal = j-OPER-SYS-REV, volume = "25", number = "3S", pages = "164--175", month = apr, year = "1991", CODEN = "OSRED8", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 15:24:15 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @Article{Draves:1991:UCI, author = "Richard P. Draves and Brian N. Bershad and Richard F. Rashid and Randall W. Dean", title = "Using continuations to implement thread management and communication in operating systems", journal = j-OPER-SYS-REV, volume = "25", number = "5", pages = "122--136", month = oct, year = "1991", CODEN = "OSRED8", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:57 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @InProceedings{Faulkner:1991:PFS, author = "Roger Faulkner and Ron Gomes", title = "The Process File System and Process Model in {UNIX System V}", crossref = "USENIX:1991:PWU", pages = "243--252", year = "1991", bibdate = "Mon Jan 02 08:29:13 2017", bibsource = "ftp://ftp.uu.net/library/bibliography; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/unix.bib; https://www.math.utah.edu/pub/tex/bib/usenix1990.bib", URL = "http://obits.mlive.com/obituaries/grandrapids/obituary.aspx?pid=180588279; http://thenewstack.io/remembering-roger-faulkner/; https://www.usenix.org/memoriam-roger-faulkner; https://www.usenix.org/sites/default/files/usenix_winter91_faulkner.pdf", abstract = "We describe the process file system {\bf /proc} in UNIX System V Release 4 and its relationship to the UNIX process model 
abstraction. {\bf /proc} began as a debugger interface superseding {\em ptrace(2)\/} but has evolved into a general interface to the process model. It provides detailed process information and control mechanisms that are independent of operating system implementation details and portable to a large class of real architectures. Control is thorough. Processes can be stopped and started on demand and can be instructed to stop on events of interest: specific machine faults, specific signals, and entry to or exit from specific system calls. Complete encapsulation of a process's execution environment is possible, as well as non-intrusive inspection. Breakpoint debugging is relieved from the ambiguities of signals. Security provisions are complete and non-destructive.\par The addition of multi-threading to the process model motivates a proposal for a substantial change to the {\bf /proc} interface that would replace the single-level flat structure with a hierarchy of directories containing status and control files. This restructuring would eliminate all {\em ioctl(2)\/} operations in favor of {\em read(2)\/} and {\em write(2)\/} operations, which generalize more easily to networks.", acknowledgement = ack-nhfb, author-dates = "Roger Faulkner (8 April 1940--2 July 2016)", } @Article{Gallmeister:1991:EEP, author = "Bill O. Gallmeister and Chris Lanier", title = "Early experience with {POSIX 1003.4} and {POSIX 1003.4a}", journal = j-PROC-REAL-TIME-SYS-SYMP, pages = "190--198 (of ix + 307)", year = "1991", CODEN = "PRSYEA", ISBN = "0-8186-2450-7", ISBN-13 = "978-0-8186-2450-6", LCCN = "QA 76.54 R43 1991", bibdate = "Mon Dec 22 09:06:02 1997", bibsource = "Compendex database; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "IEEE catalog number 91CH3090-8.", abstract = "Two proposed IEEE standards for real-time operating systems support, POSIX.4 and POSIX.4a, are proceeding towards IEEE approval and will eventually become international standards.
The authors provide a brief overview of the facilities of POSIX.4 and POSIX.4a. They concentrate on a few of the critical features that POSIX.4 and POSIX.4a provide and describe the POSIX.4 scheduling interface. The POSIX.4a support for multiple threads of control is also described. The features found in POSIX.4 and POSIX.4a for synchronization of multiple threads, are discussed, and the POSIX.4 interprocess communication facility is presented. The performance numbers are given to allow comparisons of the facilities of traditional UNIX systems, the facilities of a representative hard real-time system (LynxOS), and the facilities of POSIX.4 and POSIX.4a.", acknowledgement = ack-nhfb, classification = "722; 723; 902", conference = "Proceedings of the 12th Real-Time Systems Symposium", conferenceyear = "1991", fjournal = "Proceedings --- Real-Time Systems Symposium", journalabr = "Proc Real Time Syst Symp", keywords = "Computer Operating Systems--Standards; Computer Systems, Digital; POSIX.4a Standards; Real Time Operation; Real-Time Operating Systems", meetingaddress = "San Antonio, TX, USA", meetingdate = "Dec 4--6 1991", meetingdate2 = "12/04--06/91", publisherinfo = "IEEE Service Center", sponsor = "IEEE Computer Soc", } @TechReport{Glenn:1991:CMH, author = "Ray R. Glenn", title = "Characterizing memory hot spots in a shared memory {MIMD} machine", type = "Technical report", number = "SRC-TR-91-039", institution = inst-SRC-IDA, address = inst-SRC-IDA:adr, pages = "24", day = "15", month = oct, year = "1991", bibdate = "Fri Aug 30 08:01:51 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "This paper analyzes two memory hot spot problems associated with massively parallel MIMD computers. The first is the memory stride problem, which is similar to stride problems found in existing supercomputers. 
The second hot spot problem occurs in designs that use two separate memory accesses to lock and unlock critical sections (split transaction) and employ a first come/first serve queuing mechanism for shared memory locations. A bistability in throughput brought about by these conditions is analyzed and experimentally demonstrated. Simple equations are presented which predict the throughput at a critical section of code as a function of the number of applied threads. In particular, the mean size of the work items that can be executed in parallel without the possibility of stalling is proportional to the square of the number of threads applied.", acknowledgement = ack-nhfb, keywords = "Multiprocessors", } @InProceedings{Hirata:1991:MPA, author = "H. Hirata and Y. Mochizuki and A. Nishimura and Y. Nakase", title = "A Multithreaded Processor Architecture with Simultaneous Instruction Issuing", crossref = "Anonymous:1991:PIS", pages = "87--96", year = "1991", bibdate = "Mon Aug 26 10:38:41 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @InProceedings{Hironaka:1991:SVP, author = "T. Hironaka and T. Hashimoto and K. Okazaki and K. 
Murakami", title = "A Single-Chip Vector-Processor Prototype Based on Multithreaded Streaming\slash {FIFO} ({MSFV}) Architecture", crossref = "Anonymous:1991:PIS", pages = "77--86", year = "1991", bibdate = "Mon Aug 26 10:38:41 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Horiguchi:1991:PEP, author = "Susumu Horiguchi and Takeo Nakada", title = "Performance Evaluation of Parallel Fast {Fourier} Transform on a Multiprocessor Workstation", journal = j-J-PAR-DIST-COMP, volume = "12", number = "2", pages = "158--163", month = jun, year = "1991", CODEN = "JPDCER", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Sat Apr 12 17:13:17 MDT 1997", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classification = "C4190 (Other numerical methods); C4240 (Programming and algorithm theory); C5440 (Multiprocessor systems and techniques)", corpsource = "Dept. of Inf. Sci., Tohoku Univ., Sendai, Japan", fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", keywords = "algorithms; cache protocols; fast Fourier transform; fast Fourier transforms; FFT; floating-; multiprocess operating system; multiprocessing systems; multiprocessor workstation; multithread operating system; operating systems; parallel; parallel FFT; performance; performance evaluation; point coprocessors", treatment = "P Practical", } @Article{Hum:1991:NHS, author = "H. H. J. Hum and G. R. 
Gao", title = "A Novel High-Speed Memory Organization for Fine-Grain Multi-Thread Computing", journal = j-LECT-NOTES-COMP-SCI, volume = "505", pages = "34--??", year = "1991", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Mon May 13 08:51:55 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Jolitz:1991:PUB, author = "W. F. Jolitz and L. G. Jolitz", title = "Porting {UNIX} to the 386. {The} basic kernel Multiprogramming and multitasking. {II}", journal = j-DDJ, volume = "16", number = "10", pages = "62, 64, 66, 68, 70, 72, 118--120", month = oct, year = "1991", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Tue Sep 10 09:11:02 MDT 1996", bibsource = "http://www.ddj.com/index/author/index.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classification = "C6110 (Systems analysis and programming); C6150J (Operating systems)", fjournal = "Dr. Dobb's Journal of Software Tools", keywords = "386BSD kernel; Multiple simultaneous process execution; Multiprogramming; Multitasking; Multithread operations; Operating systems; Porting; Sleep( ); Swch( ); Switching mechanisms; UNIX; Wakeup( )", thesaurus = "C listings; Microprocessor chips; Multiprogramming; Software portability; Unix", } @InProceedings{Jones:1991:BCL, author = "Michael B. 
Jones", title = "Bringing the {C} Libraries with Us into a Multi-Threaded Future", crossref = "USENIX:1991:PWU", pages = "81--92", day = "21--25", month = jan, year = "1991", bibdate = "Fri Oct 18 07:24:24 MDT 1996", bibsource = "ftp://ftp.uu.net/library/bibliography; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, affiliation = "Carnegie Mellon University", } @InProceedings{Kuchlin:1991:MCI, author = "Wolfgang K{\"u}chlin", title = "On the multi-threaded computation of integral polynomial greatest common divisors", crossref = "Watt:1991:IPI", pages = "333--342", year = "1991", bibdate = "Thu Mar 12 08:38:03 MST 1998", bibsource = "http://www.acm.org/pubs/toc/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/proceedings/issac/120694/p333-kuchlin/", abstract = "Reports experiences and practical results from parallelizing the Brown--Collins polynomial g.c.d. algorithm, starting from Collins' SAC-2 implementation IPGCDC. The parallelization environment is PARSAC-2, a multi-threaded version of SAC-2 programmed in C with the parallelization constructs of the C Threads library. IPGCDC computes the g.c.d. and its co-factors of two polynomials in $ Z(x_1, \ldots {}, x_r) $, by first reducing the problem to multiple calculations of modular polynomial g.c.d.'s in $ Z_p(x_1, \ldots {}, x_r) $, and then recovering the result by Chinese remaindering. After studying timings of the SAC-2 algorithm, the author first parallelizes the Chinese remainder algorithm, and then parallelizes the main loop of IPGCDC by executing the modular g.c.d. computations concurrently. Finally, he determines speed-up's and speed-up efficiencies of our parallel algorithms over a wide range of polynomials. The experiments were conducted on a 12 processor Encore Multimax under Mach.", acknowledgement = ack-nhfb, affiliation = "Dept. of Comput. and Inf. 
Sci., Ohio State Univ., Columbus, OH, USA", classification = "C4240 (Programming and algorithm theory); C7310 (Mathematics)", keywords = "algorithms; Brown--Collins polynomial g.c.d. algorithm; Chinese remaindering; Encore Multimax; Multi-threaded computation; PARSAC-2; Polynomial greatest common divisors", subject = "{\bf G.1.0} Mathematics of Computing, NUMERICAL ANALYSIS, General, Parallel algorithms. {\bf F.2.1} Theory of Computation, ANALYSIS OF ALGORITHMS AND PROBLEM COMPLEXITY, Numerical Algorithms and Problems, Computations on polynomials. {\bf I.1.0} Computing Methodologies, SYMBOLIC AND ALGEBRAIC MANIPULATION, General. {\bf I.1.3} Computing Methodologies, SYMBOLIC AND ALGEBRAIC MANIPULATION, Languages and Systems. {\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language Classifications, C.", thesaurus = "Mathematics computing; Parallel algorithms; Symbol manipulation", } @InProceedings{Malan:1991:MA, author = "G. Malan and R. Rashid and D. Golub and R. Baron", title = "{DOS} as a {Mach 3.0} application", crossref = "USENIX:1991:PUM", pages = "27--40", year = "1991", bibdate = "Sat Sep 28 19:47:51 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/mach.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", acknowledgement = ack-nhfb, affiliation = "Sch. of Comput. 
Sci., Carnegie Mellon Univ., Pittsburgh, PA, USA", classification = "C6150J (Operating systems); C7430 (Computer engineering)", keywords = "Common DOS functions; Common DOS software; DOS functionality; DOS operating system; Frequently loaded DOS drivers; High memory area; High-speed space combat simulation system; I/O devices; I386/i486 architecture; Latency demands; Mach features; Machine-dependent kernel modifications; Multiple virtual DOS environments; Multithreaded emulation; PC architecture; Performance sensitive PC entertainment software; Timing; Unix emulation; Unix Server; VGA display; Virtual 8086 mode; Virtual machine environment; Wing Commander", thesaurus = "IBM computers; Microcomputer applications; Supervisory programs; Unix; Virtual machines", } @Article{Man:1991:MLC, author = "Richard F. Man", title = "A Multithreading Library In {C} For Subsumption Architecture", journal = j-CUJ, volume = "9", number = "11", pages = "42--??", month = nov, year = "1991", ISSN = "0898-9788", bibdate = "Fri Aug 30 16:52:23 MDT 1996", bibsource = "http://www.cuj.com/cbklist.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "C Users Journal", } @Article{Marsh:1991:FCU, author = "Brian D. Marsh and Michael L. Scott and Thomas J. LeBlanc and Evangelos P. 
Markatos", title = "First-class user-level threads", journal = j-OPER-SYS-REV, volume = "25", number = "5", pages = "110--121", month = oct, year = "1991", CODEN = "OSRED8", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:57 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @MastersThesis{Mennemeier:1991:HMS, author = "Lawrence Mennemeier", title = "Hardware mechanisms to support concurrent threads on {RISC} and superscalar multiprocessors", type = "Thesis ({M.S.})", school = "University of California, Santa Cruz", pages = "vii + 39", year = "1991", LCCN = "QA76.5.M44 1991", bibdate = "Fri May 10 12:18:17 MDT 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Masters theses -- University of California, Santa Cruz -- 1991; multiprocessors; parallel processing (electronic computers)", } @Article{Papadopoulos:1991:MRV, author = "Gregory M. Papadopoulos and Kenneth R.
Traub", title = "Multithreading: a revisionist view of dataflow architectures", journal = j-COMP-ARCH-NEWS, volume = "19", number = "3", pages = "342--351", month = may, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @PhdThesis{Park:1991:PTM, author = "Won Woo Park", title = "Performance-area trade-offs in multithreaded processing unit", type = "Thesis ({Ph.D.})", school = "University of Texas at Austin", address = "Austin, TX, USA", pages = "xvii + 165", year = "1991", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Computer architecture; Multiprocessors; Parallel processing (Electronic computers)", } @MastersThesis{Pham:1991:EMD, author = "Thuan Quang Pham", title = "The experimental migration of a distributed application to a multithreaded environment", type = "Thesis ({M.S.})", school = "Massachusetts Institute of Technology, Department of Electrical Engineering and Computer Science", address = "Cambridge, MA, USA", pages = "51", year = "1991", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Ponamgi:1991:DMP, author = "M. Krish Ponamgi and Wenwey Hseush and Gail E. 
Kaiser", title = "Debugging Multithreaded Programs with {MPD}", journal = j-IEEE-SOFTWARE, volume = "8", number = "3", pages = "37--43", month = may, year = "1991", CODEN = "IESOEG", ISSN = "0740-7459 (print), 0740-7459 (electronic)", ISSN-L = "0740-7459", bibdate = "Sat Jan 25 07:35:26 MST 1997", bibsource = "Compendex database; https://www.math.utah.edu/pub/tex/bib/ieeesoft.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; Misc/IMMD_IV.bib; Parallel/debug_3.1.bib", acknowledgement = ack-nhfb, affiliation = "Dept of Comput Sci, Columbia Univ, New York, NY, USA", classification = "723", fjournal = "IEEE Software", journal-URL = "http://www.computer.org/portal/web/csdl/magazines/software", journalabr = "IEEE Software", keywords = "Computer Programming; Computer Systems, Digital --- Multiprocessing; Event Recognition; Multiprocessor Debugger; Multithreaded Software; Pattern Recognition; Program Debugging", } @InProceedings{Powell:1991:SMT, author = "M. L. Powell and S. R. Kleiman and S. Barton and D. Shah and D. Stein and M. 
Weeks", title = "{SunOS} Multi-thread Architecture", crossref = "USENIX:1991:PWU", institution = "Sun Microsystems, Inc.", pages = "65--80", day = "21--25", month = jan, year = "1991", bibdate = "Wed Aug 13 10:48:45 MDT 1997", bibsource = "ftp://ftp.uu.net/library/bibliography; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.usenix.org/cgi-bin/sortbib.pl?-sA", acknowledgement = ack-nhfb, affiliation = "Sun Microsystems, Inc.", } @Article{Richman:1991:EHC, author = "Scott Richman", title = "Examining the {Hamilton C} shell ({Unix} power for {OS/2})", journal = j-DDJ, volume = "16", number = "1", pages = "98, 100, 102, 104--106", month = jan, year = "1991", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Tue Sep 10 09:11:02 MDT 1996", bibsource = "http://www.ddj.com/index/author/index.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database", abstract = "Doug Hamilton's C Shell helps you create more powerful OS/2 programs.", acknowledgement = ack-nhfb, classification = "C6115 (Programming support); C6150E (General utility programs); C6150J (Operating systems)", fjournal = "Dr. Dobb's Journal of Software Tools", keywords = "C shell environment; C++ programs; High-performance file system; Large command lines; Long filenames; OS/2 features; Pipes; Presentation Manager; Script language; Script program; Shell scripts; Text windows; Threads; Utilities", thesaurus = "C listings; Software packages; Software tools; Utility programs", } @TechReport{Saavedra-Barrera:1991:ASM, author = "Rafael H. Saavedra-Barrera and David E. 
Culler", title = "An analytical solution for a {Markov} chain modeling multithreaded execution", type = "Report", number = "UCB/CSD 91/623", institution = "University of California, Berkeley, Computer Science Division", address = "Berkeley, CA, USA", pages = "24", month = apr, year = "1991", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Multithreading is an architectural technique aimed at maintaining high processor utilization in the presence of large memory or interprocessor communication latency. While waiting for a remote reference to complete, the processor switches to another execution thread. Several realizations of this concept have been proposed, but little data is available on the actual costs and benefits. This paper presents an analytical model of multithreaded execution, which may serve to guide and explain empirical studies. The model is based on three key parameters: thread run-length, switch cost, and latency. A closed-form expression for processor utilization is obtained for deterministic and stochastic run-lengths. The derivation involves identifying specific patterns in the very large set of equations forming the Markov chain. Using this result, three operating regimes are identified for a multithreaded processor subject to long latencies: linear, where utilization is proportional to the number of threads per processor, saturation, where utilization is determined only by the run-length and switch cost, and transition between the other regimes. The model can be used to estimate the effects of several architectural variations.", acknowledgement = ack-nhfb, annote = "Supported in part by NASA under consortium agreement NCA2-128 and cooperative agreement NCC2-550. Supported in part by the National Science Foundation.", keywords = "Computer architecture; Markov chains", } @Article{Schauser:1991:CCM, author = "Klaus Erik Schauser and David E. 
Culler and Thorsten {von Eicken}", title = "Compiler-Controlled Multithreading for Lenient Parallel Languages", journal = j-LECT-NOTES-COMP-SCI, volume = "523", pages = "50--??", year = "1991", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Mon May 13 08:51:55 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @MastersThesis{Schauser:1991:CDT, author = "Klaus Erik Schauser", title = "Compiling dataflow into threads: efficient compiler-controlled multithreading for lenient parallel languages", type = "Thesis ({M.S.})", school = "University of California, Berkeley, Computer Science Division", address = "Berkeley, CA, USA", pages = "71", day = "2", month = jul, year = "1991", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "Also available as Report UCB/CSD 91/644", abstract = "Powerful non-strict parallel languages require fast dynamic scheduling. This thesis explores how the need for multithreaded execution can be addressed as a compilation problem, to achieve switching rates approaching what hardware mechanisms might provide. Compiler-controlled multithreading is examined through compilation of a lenient parallel language, ID90, for a threaded abstract machine, TAM. A key feature of TAM is that synchronization is explicit and occurs only at the start of a thread, so that a simple cost model can be applied. A scheduling hierarchy allows the compiler to schedule logically related threads closely together in time and to use registers across threads. Remote communication is via message sends and split-phase memory accesses. Messages and memory replies are received by compiler-generated message handlers which rapidly integrate these events with thread scheduling. 
To compile ID90 for TAM, we employ a new parallel intermediate form, dual-graphs, with distinct control and data arcs. This provides a clean framework for partitioning the program into threads, scheduling threads, and managing registers under asynchronous execution. The compilation process is described and preliminary measurements of the effectiveness of the approach are discussed. Previous to this work, execution of Id90 programs was limited to specialized architectures or dataflow graph interpreters. By compiling via TAM, we have achieved more than two orders of magnitude performance improvement over graph interpreters on conventional machines, making this Id90 implementation competitive with machines supporting dynamic instruction scheduling in hardware. Timing measurements show that our Id90 implementation on a standard RISC can achieve a performance close to Id90 on one processor of the recent dataflow machine Monsoon. It can be seen that the TAM partitioning presented in this thesis reduces the control overhead substantially and that more aggressive partitioning would yield modest additional benefit. There is, however, considerable room for improvement in scheduling and register management.", acknowledgement = ack-nhfb, annote = "Supported in part by the National Science Foundation. Supported in part by Motorola Inc., the TRW Foundation, and the International Computer Science Institute", keywords = "Compilers (Computer programs); Parallel programming (Computer science)", } @TechReport{Schauser:1991:CML, author = "Klaus Erik Schauser and David E. 
Culler and Thorsten {von Eicken}", title = "Compiler-controlled multithreading for lenient parallel languages", type = "Report", number = "UCB/CSD 91/640", institution = "University of California, Berkeley, Computer Science Division", address = "Berkeley, CA, USA", pages = "21", day = "30", month = jul, year = "1991", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "A version of this report is to appear in the Proceedings of FPCA '91 Conference on Functional Programming Languages and Computer Architecture, Aug. 1991, Springer-Verlag", abstract = "Tolerance to communication latency and inexpensive synchronization are critical for general-purpose computing on large multiprocessors. Fast dynamic scheduling is required for powerful nonstrict parallel languages. However, machines that support rapid switching between multiple execution threads remain a design challenge. This paper explores how multithreaded execution can be addressed as a compilation problem, to achieve switching rates approaching what hardware mechanisms might provide. Compiler-controlled multithreading is examined through compilation of a lenient parallel language, Id90, for a threaded abstract machine, TAM. A key feature of TAM is that synchronization is explicit and occurs only at the start of a thread, so that a simple cost model can be applied. A scheduling hierarchy allows the compiler to schedule logically related threads closely together in time and to use registers across threads. Remote communication is via message sends and split-phase memory accesses. Messages and memory replies are received [sic] by compiler-generated message handlers which rapidly integrate these events with thread scheduling. To compile Id90 for TAM, we employ a new parallel intermediate form, dual-graphs, with distinct control and data arcs. 
This provides a clean framework for partitioning the program into threads, scheduling threads, and managing registers under asynchronous execution. The compilation process is described and preliminary measurements of its effectiveness are discussed. Dynamic execution measurements are obtained via a second compilation step, which translates TAM into native code for existing machines with instrumentation incorporated. These measurements show that the cost of compiler-controlled multithreading is within a small factor of the cost of control flow in sequential languages.", acknowledgement = ack-nhfb, annote = "Supported in part by the National Science Foundation PYI Award. Supported in part by Motorola Inc., the TRW Foundation and the Semiconductor Research Corporation Supported in part by J. Wawrzynek's PYI Award. Supported in part by NSF Infrastructure Grant.", keywords = "Compilers (Computer programs); Parallel programming (Computer science)", } @Article{Schwan:1991:RTT, author = "Karsten Schwan and Hongyi Zhou and Ahmed Gheith", title = "Real-time threads", journal = j-OPER-SYS-REV, volume = "25", number = "4", pages = "35--46", month = oct, year = "1991", CODEN = "OSRED8", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @Article{Speer:1991:DTP, author = "Thomas G. Speer and Mark W. 
Storm", title = "{Digital}'s Transaction Processing Monitors", journal = j-DEC-TECH-J, volume = "3", number = "1", pages = "18--32", month = "Winter", year = "1991", CODEN = "DTJOEL", ISSN = "0898-901X", bibdate = "Thu Mar 20 18:15:43 MST 1997", bibsource = "/usr/local/src/bib/bibliography/Database/Graefe.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "ftp://ftp.digital.com/pub/Digital/info/DTJ/v3n1/Digitals_Transaction_Processi_01oct1991DTJ102P8.ps; http://www.digital.com:80/info/DTJ102/DTJ102SC.TXT", abstract = "Digital provides two transaction processing (TP) monitor products --- ACMS (Application Control and Management System) and DECintact (Integrated Application Control). Each monitor is a unified set of transaction processing services for the application environment. These services are layered on the VMS operating system. Although there is a large functional overlap between the two, both products achieve similar goals by means of some significantly different implementation strategies. Flow control and multithreading in the ACMS monitor is managed by means of a fourth-generation language (4GL) task definition language. Flow control and multithreading in the DECintact monitor is managed at the application level by third-generation language (3GL) calls to a library of services. The ACMS monitor supports a deferred task model of queuing, and the DECintact monitor supports a message-based model. 
Over time, the persistent distinguishing feature between the two monitors will be their different application programming interfaces.", acknowledgement = ack-nhfb, affiliation = "Digital Equipment Corp., Maynard, MA, USA", classcodes = "C6150J (Operating systems)", classification = "C6150J (Operating systems)", corpsource = "Digital Equipment Corp., Maynard, MA, USA", fjournal = "Digital Technical Journal", keywords = "ACMS; Application; Application Control; Application Control and Management System; Application programming interfaces; application programming interfaces; Control and Management System; DECintact; Digital; Integrated; Integrated Application Control; message-based model; Message-based model; monitors; Monitors; Multithreading; multithreading; Queuing; queuing; supervisory programs; task definition language; Task definition language; transaction processing; Transaction processing; transaction processing; VMS operating system", thesaurus = "Supervisory programs; Transaction processing", treatment = "P Practical", } @Article{Traub:1991:MTC, author = "Kenneth R.
Traub", title = "Multi-thread Code Generation for Dataflow Architectures from Non-Strict Programs", journal = j-LECT-NOTES-COMP-SCI, volume = "523", pages = "73--??", year = "1991", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Mon May 13 08:51:55 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Agarwal:1992:PTM, author = "Anant Agarwal", title = "Performance tradeoffs in multithreaded processors", journal = j-IEEE-TRANS-PAR-DIST-SYS, volume = "3", number = "5", pages = "525--539", month = sep, year = "1992", CODEN = "ITDSEO", DOI = "https://doi.org/10.1109/71.159037", ISSN = "1045-9219 (print), 1558-2183 (electronic)", ISSN-L = "1045-9219", bibdate = "Fri Apr 11 15:20:39 MDT 1997", bibsource = "Compendex database; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, affiliation = "Lab for Comput Sci, MIT, Cambridge, MA, USA", classification = "722.1; 722.4; C4230M (Multiprocessor interconnection); C4240P (Parallel programming and algorithm theory); C5220P (Parallel architecture); C5320G (Semiconductor storage); C5440 (Multiprocessor systems and techniques); C5470 (Performance evaluation and testing); C6120 (File organisation)", corpsource = "Lab. for Comput. 
Sci., MIT, Cambridge, MA, USA", fjournal = "IEEE Transactions on Parallel and Distributed Systems", journal-URL = "http://www.computer.org/tpds/archives.htm", journalabr = "IEEE Trans Parallel Distrib Syst", keywords = "buffer storage; cache interference; Cache memories; caches; contention; context-switching overhead; data-sharing; Digital storage; interconnection networks; Interconnection networks; multiprocessing systems; multiprocessor; multithreaded processors; network; network bandwidth; parallel; parallel algorithms; Parallel processing systems; Performance; Performance analysis; performance evaluation; Pipeline processing systems; programming; storage management; switching theory", treatment = "P Practical; T Theoretical or Mathematical", } @InProceedings{Alverson:1992:EHP, author = "G. A. Alverson and R. Alverson and D. Callahan and B. Koblenz", title = "Exploiting Heterogeneous Parallelism on a Multi-threaded Multiprocessor", crossref = "ACM:1992:CPI", pages = "188--197", year = "1992", bibdate = "Mon Aug 26 10:38:41 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Anderson:1992:SAE, author = "Thomas E. Anderson and Brian N. Bershad and Edward D. Lazowska and Henry M. Levy", title = "Scheduler Activations: Effective Kernel Support for the User-Level Management of Parallelism", journal = j-TOCS, volume = "10", number = "1", pages = "53--79", month = feb, year = "1992", CODEN = "ACSYEC", ISSN = "0734-2071 (print), 1557-7333 (electronic)", ISSN-L = "0734-2071", bibdate = "Wed Jan 13 18:36:53 MST 1999", bibsource = "http://www.acm.org/pubs/contents/journals/tocs/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/journals/tocs/1992-10-1/p53-anderson/", abstract = "{\em Threads\/} are the vehicle for concurrency in many approaches to parallel programming. 
Threads can be supported either by the operating system kernel or by user-level library code in the application address space, but neither approach has been fully satisfactory. This paper addresses this dilemma. First, we argue that the performance of kernel threads is {\em inherently\/} worse than that of user-level threads, rather than this being an artifact of existing implementations; managing parallelism at the user level is essential to high-performance parallel computing. Next, we argue that the problems encountered in integrating user-level threads with other system services is a consequence of the lack of kernel support for user-level threads provided by contemporary multiprocessor operating systems; kernel threads are the {\em wrong abstraction\/} on which to support user-level management of parallelism. Finally, we describe the design, implementation, and performance of a new kernel interface and user-level thread package that together provide the same functionality as kernel threads without compromising the performance and flexibility advantages of user-level management of parallelism.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Computer Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J774", keywords = "design; measurement; performance", subject = "{\bf D.4.1} Software, OPERATING SYSTEMS, Process Management, Scheduling. {\bf D.4.4} Software, OPERATING SYSTEMS, Communications Management, Input/output. {\bf D.4.7} Software, OPERATING SYSTEMS, Organization and Design. {\bf D.4.8} Software, OPERATING SYSTEMS, Performance.", } @Article{Anonymous:1992:MWPa, author = "Anonymous", title = "It's a Multithreaded World, Part 1: Multithreaded operating systems are becoming the norm. 
{Here}'s how your applications can exploit them", journal = j-BYTE, volume = "17", number = "5", pages = "289--??", month = may, year = "1992", CODEN = "BYTEDJ", ISSN = "0360-5280 (print), 1082-7838 (electronic)", ISSN-L = "0360-5280", bibdate = "Tue Jan 2 10:01:41 MST 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "BYTE Magazine", } @Article{Anonymous:1992:MWPb, author = "Anonymous", title = "It's a Multithreaded World, Part 2: Multithreaded operating systems are taking over. {Are} your applications ready?", journal = j-BYTE, volume = "17", number = "6", pages = "351--??", month = jun, year = "1992", CODEN = "BYTEDJ", ISSN = "0360-5280 (print), 1082-7838 (electronic)", ISSN-L = "0360-5280", bibdate = "Tue Jan 2 10:01:41 MST 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "BYTE Magazine", } @MastersThesis{Arunachalam:1992:EMM, author = "Prakash Arunachalam", title = "Evaluation of a multithreaded microprocessor with {MIPS R3000} instruction set", type = "Thesis ({M.S. in Engineering})", school = "University of Texas at Austin", address = "Austin, TX, USA", pages = "vii + 45", year = "1992", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Computer architecture; MIPS R3000 series microprocessors; Parallel processing (Electronic computers); Reduced instruction set computers; RISC microprocessors", } @Article{Bauer:1992:PCE, author = "Barr E. 
Bauer", title = "Parallel {C} extensions", journal = j-DDJ, volume = "17", number = "8", pages = "110, 112--114, 124, 127", month = aug, year = "1992", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Tue Sep 10 10:06:23 MDT 1996", bibsource = "http://www.ddj.com/index/author/index.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, affiliation = "Schering-Plough Res. Inst., Bloomfield, NJ, USA", classification = "C6110P (Parallel programming); C6140D (High level languages); C6150C (Compilers, interpreters and other processors)", fjournal = "Dr. Dobb's Journal of Software Tools", keywords = "C extensions; C programs; Parallel execution regions; Parallel execution threads; Parallelized program; Serial program; Silicon Graphics IRIS Power C compiler", thesaurus = "C language; C listings; Parallel languages; Program compilers", } @Article{Bershad:1992:FME, author = "Brian N. Bershad and David D. Redell and John R. Ellis", title = "Fast mutual exclusion for uniprocessors", journal = j-SIGPLAN, volume = "27", number = "9", pages = "223--233", month = sep, year = "1992", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:16:26 MST 2003", bibsource = "http://portal.acm.org/; http://www.acm.org/pubs/toc/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/proceedings/asplos/143365/p223-bershad/", abstract = "In this paper we describe restartable atomic sequences, an {\em optimistic\/} mechanism for implementing simple atomic operations (such as {\em Test-And-Set\/}) on a uniprocessor. A thread that is suspended within a restartable atomic sequence is resumed by the operating system at the beginning of the sequence, rather than at the point of suspension. This guarantees that the thread eventually executes the sequence {\em atomically\/}. 
A restartable atomic sequence has significantly less overhead than other software-based synchronization mechanisms, such as kernel emulation or software reservation. Consequently, it is an attractive alternative for use on uniprocessors that do no support atomic operations. Even on processors that do support atomic operations in hardware, restartable atomic sequences can have lower overhead. We describe different implementations of restartable atomic sequences for the Mach 3.0 and Taos operating systems. These systems' thread management packages rely on atomic operations to implement higher-level mutual exclusion facilities. We show that improving the performance of low-level atomic operations, and therefore mutual exclusion mechanisms, improves application performance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "design; languages; measurement; performance", subject = "{\bf D.4.1} Software, OPERATING SYSTEMS, Process Management, Mutual exclusion.", } @MastersThesis{Blumofe:1992:MSM, author = "Robert D. 
(Robert David) Blumofe", title = "Managing storage for multithreaded computations", type = "Thesis ({M.S.})", school = "Massachusetts Institute of Technology, Laboratory for Computer Science, Department of Electrical Engineering and Computer Science", address = "Cambridge, MA, USA", pages = "83", year = "1992", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "Also available as Report MIT/LCS/TR 552.", acknowledgement = ack-nhfb, } @Article{Boothe:1992:IMT, author = "Bob Boothe and Abhiram Ranade", title = "Improved multithreading techniques for hiding communication latency in multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "214--223", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Cattaneo:1992:ACT, author = "G. Cattaneo and G. Di Giore and M. Ruotolo", title = "Another {C} Threads Library", journal = j-SIGPLAN, volume = "27", number = "12", pages = "81--90", month = dec, year = "1992", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:16:30 MST 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @MastersThesis{Chowdhury:1992:PEA, author = "Indranil Chowdhury", title = "Performance evaluation and architecture of an instruction cache for multithreaded {RISC} processor", type = "Thesis ({M.S. 
in Engineering})", school = "University of Texas at Austin", address = "Austin, TX, USA", pages = "x + 93", year = "1992", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Cache memory -- Evaluation -- Simulation methods; Computer architecture; Microprocessors; Reduced instruction set computers", } @TechReport{Culler:1992:AMMa, author = "David E. Culler and Michial Gunter and James C. Lee", title = "Analysis of multithreaded microprocessors under multiprogramming", type = "Report", number = "UCB/CSD 92/687", institution = "University of California, Berkeley, Computer Science Division", address = "Berkeley, CA, USA", pages = "17", month = may, year = "1992", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Multithreading has been proposed as a means of tolerating long memory latencies in multiprocessor systems. Fundamentally, it allows multiple concurrent subsystems (cpu, network, and memory) to be utilized simultaneously. This is advantageous on uniprocessor systems as well, since the processor is utilized while the memory system services misses. We examine multithreading on high-performance uniprocessors as a means of achieving better cost/performance on multiple processes. Processor utilization and cache behavior are studied both analytically and through simulation of timesharing and multithreading using interleaved reference traces. Multithreading is advantageous when one has large on-chip caches (32 kilobytes), associativity of two, and a memory access cost of roughly 50 instruction times. At this point, a small number of threads (2-4) is sufficient, the thread switch need not be extraordinarily fast, and the memory system need support only one or two outstanding misses. 
The increase in processor real-estate to support multithreading is modest, given the size of the cache and floating-point units. A surprising observation is that miss ratios may be lower with multithreading than with timesharing under a steady-state load. This occurs because switch-on-miss multithreading introduces unfair thread scheduling, giving more CPU cycles to processes with better cache behavior.", acknowledgement = ack-nhfb, annote = "Supported in part by the National Science Foundation. Supported in part by Motorola Inc. and the TRW Foundation", keywords = "Microprocessors; Multiprogramming (Electronic computers)", } @Article{Culler:1992:AMMb, author = "David E. Culler and Michial Gunter and James C. Lee", title = "Analysis of multithreaded microprocessors under multiprogramming", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "438--438", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Day:1992:INB, author = "Michael Day", title = "Implementing {NLM-Based} Client\slash Server Architectures", journal = j-DDJ, volume = "17", number = "10", pages = "78--84", month = oct, year = "1992", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Tue Sep 03 09:15:34 1996", bibsource = "http://www.ddj.com/index/author/index.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database", abstract = "NetWare NLMs take full advantage of the multitasking, multithreaded architecture of the operating system. 
Michael presents a distributed file manager made up of two modules: ENGINE.NLM, an NLM running on a NetWare 3.x server, and CLIENT.EXE, a DOS-based front end running on the client.", acknowledgement = ack-nhfb, classification = "C6150N (Distributed systems)", fjournal = "Dr. Dobb's Journal of Software Tools", keywords = "32-Bit protected-mode programs; Client/server architectures; Distributed file manager; DOS-based front end; Multitasking; Multithreaded architecture; NetWare 3.x operating system; Netware Loadable Modules; Networked system", thesaurus = "Distributed processing; File servers", } @Article{Day:1992:INC, author = "Michael Day", title = "Implementing {NLM-Based} Client\slash Server Architectures", journal = j-DDJ, volume = "17", number = "10", pages = "78--84", month = oct, year = "1992", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Tue Sep 03 09:15:34 1996", bibsource = "http://www.ddj.com/index/author/index.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database", abstract = "NetWare NLMs take full advantage of the multitasking, multithreaded architecture of the operating system. Michael presents a distributed file manager made up of two modules: ENGINE.NLM, an NLM running on a NetWare 3.x server, and CLIENT.EXE, a DOS-based front end running on the client.", acknowledgement = ack-nhfb, classification = "C6150N (Distributed systems)", fjournal = "Dr. Dobb's Journal of Software Tools", keywords = "32-Bit protected-mode programs; Client/server architectures; Distributed file manager; DOS-based front end; Multitasking; Multithreaded architecture; NetWare 3.x operating system; Netware Loadable Modules; Networked system", thesaurus = "Distributed processing; File servers", } @Article{DHollander:1992:PLL, author = "Erik H. 
D'Hollander", title = "Partitioning and labeling of loops by unimodular transformations", journal = j-IEEE-TRANS-PAR-DIST-SYS, volume = "3", number = "4", pages = "465--476", month = jul, year = "1992", CODEN = "ITDSEO", DOI = "https://doi.org/10.1109/71.149964", ISSN = "1045-9219 (print), 1558-2183 (electronic)", ISSN-L = "1045-9219", MRclass = "68Q10 (68Q22)", MRnumber = "93f:68030", bibdate = "Mon Apr 14 07:37:07 1997", bibsource = "Compendex database; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, affiliation = "Dept of Electr Eng, State Univ of Ghent, Belgium", classification = "722; 723; C4240P (Parallel programming and algorithm theory); C6110P (Parallel programming); C6150C (Compilers, interpreters and other processors)", corpsource = "Dept. of Electr. Eng., State Univ. of Ghent, Belgium", fjournal = "IEEE Transactions on Parallel and Distributed Systems", journal-URL = "http://www.computer.org/tpds/archives.htm", journalabr = "IEEE Trans Parallel Distrib Syst", keywords = "computational complexity; Computer Programming --- Algorithms; Computer Systems Programming; constant dependence vectors; dependence matrix; dependent iterations; do-loops; fold nested loop; independent subsets; invariant dependence; join; labelling algorithm; loop labelling; loop partitioning; Multiprocessing Programs; multithreaded dynamic scheduling; n-; parallel; parallel algorithms; parallel DO-ALL loops; partitioning algorithm; Partitioning Algorithms; primitive; program compilers; Program Transformations; programming; programming theory; relation; scheduling; serial loop; transformation; unimodular; Unimodular Transformations; unimodular transformations", treatment = "T Theoretical or Mathematical", } @MastersThesis{Donalson:1992:DDP, author = "Douglas Dale Donalson", title = "{DISC}: a dynamic performance evaluation of a multi-thread architecture", type = "Thesis ({M.S.})", school = "Electrical and Computer Engineering Department, 
University of California, Santa Barbara", address = "Santa Barbara, CA, USA", pages = "ix + 88", year = "1992", LCCN = "TK174.C2 S25 DOND 1992", bibdate = "Sat Apr 20 11:18:53 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @InProceedings{Eykholt:1992:BMM, author = "J. R. Eykholt and S. R. Kleiman and S. Barton and R. Faulkner and D. Stein and M. Smith and A. Shivalingiah and J. Voll and M. Weeks and D. Williams", title = "Beyond Multiprocessing: Multithreading the {System V Release} 4 Kernel", crossref = "USENIX:1992:PSU", pages = "11--18", month = "Summer", year = "1992", bibdate = "Fri Oct 18 07:24:24 MDT 1996", bibsource = "ftp://ftp.uu.net/library/bibliography; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, affiliation = "SunSoft Inc.", } @TechReport{Felten:1992:IPM, author = "Edward W. Felten and Dylan James McNamee", title = "Improving the performance of message-passing applications by multithreading", type = "Technical report", number = "92-09-07", institution = "University of Washington, Dept. of Computer Science and Engineering", address = "Seattle, WA, USA", pages = "6", year = "1992", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Achieving maximum performance in message-passing programs requires that calculation and communication be overlapped. However, the program transformations required to achieve this overlap are error-prone and add significant complexity to the application program. We argue that calculation/communication overlap can be achieved easily and consistently by executing multiple threads of control on each processor, and that this approach is practical on message-passing architectures without any special hardware support. 
We present timing data for a typical message-passing application, to demonstrate the advantages of our scheme.", acknowledgement = ack-nhfb, annote = "Supported in part by the National Science Foundation. Supported in part by the Washington Technology Center, Digital Equipment Corporation, Apple Computer Company, a Mercury Seven Fellowship and an AT\&T Ph.D. Scholarship", keywords = "Operating systems", } @TechReport{Gokhale:1992:ICI, author = "Maya B. Gokhale and William W. Carlson", title = "An introduction to compilation issues for parallel machines", type = "Technical report", number = "SRC-TR-92-062", institution = inst-SRC-IDA, address = inst-SRC-IDA:adr, pages = "38", day = "8", month = sep, year = "1992", bibdate = "Fri Aug 30 08:01:51 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The exploitation of today's high-performance computer systems requires the effective use of parallelism in many forms and at numerous levels. This survey article discusses program analysis and restructuring techniques that target parallel architectures. We first describe various categories of architectures that are oriented toward parallel computation models: vector architectures, shared memory multiprocessors, massively parallel machines, message-passing architectures, VLIWs, and multithreaded architectures. We then describe a variety of optimization techniques that can be applied to sequential programs to effectively utilize the vector and parallel processing units. After an overview of basic dependence analysis, we present restructuring transformations on DO loops targeted both to vectorization and to concurrent execution, interprocedural and pointer analysis, task scheduling, instruction level parallelization, and compiler-assisted data placement. 
We conclude that although tremendous advances have been made in dependence theory and in the development of a `toolkit' of transformations, parallel systems are used most effectively when the programmer interacts in the optimization process.", acknowledgement = ack-nhfb, keywords = "Compilers (Computer programs); Computer architecture; Parallel processing (Electronic computers)", } @Article{Govindarajan:1992:LCM, author = "R. Govindarajan and S. S. Nemawarkar", title = "A Large Context Multithreaded Architecture", journal = j-LECT-NOTES-COMP-SCI, volume = "634", pages = "423--??", year = "1992", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Mon May 13 11:46:24 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @TechReport{Haines:1992:SMC, author = "Matt Haines and Anton Pedro Willem Bohm", title = "Software multithreading in a conventional distributed memory multiprocessor", type = "Technical report", number = "CS-92-126", institution = "Colorado State University, Dept. of Computer Science", address = "Fort Collins, CO, USA", pages = "25", day = "25", month = sep, year = "1992", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Today's most powerful computers are distributed memory multiprocessors. Although they possess massive amounts of available resources, it is often difficult to exploit these resources efficiently. Compilers that can cope with the complexities of these systems are being constructed, but their scope of effect is often limited due to the complexity of the analysis and the lack of runtime information. Novel architectures that can better tolerate latencies are under construction, but their effectiveness is unproven, and they do little to ease the burden on current commercial machines. 
Therefore we are designing a runtime system, called VISA, that attempts to avoid and tolerate latencies on conventional distributed memory multiprocessors, as well as provide a single addressing space to ease the burden of programming or code generation. The goal of our runtime system is to serve as a tool for studying the effects of latency avoidance and latency tolerance on programs running on these conventional architectures. In this paper we describe the design and implementation of multithreading in the VISA runtime system for the purpose of latency tolerance. In particular, we examine machine-independent designs for thread representation, thread switching, and split-phased transactions. We quantify the cost of multithreading for our environment, present a test program for which multithreading degrades performance, and present a program for which multithreading enhances performance.", acknowledgement = ack-nhfb, annote = "Supported in part by a grant from Sandia National Laboratories", keywords = "Multiprocessors", } @Article{Halladay:1992:PUM, author = "Steve Halladay and Michael Wiebel", title = "A Practical Use For Multiple Threads", journal = j-CUJ, volume = "10", number = "1", pages = "73--??", month = jan, year = "1992", ISSN = "0898-9788", bibdate = "Fri Aug 30 16:52:23 MDT 1996", bibsource = "http://www.cuj.com/cbklist.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "C Users Journal", } @Article{Hirata:1992:EPA, author = "Hiroaki Hirata and Kozo Kimura and Satoshi Nagamine and Yoshiyuki Mochizuki and Akio Nishimura and Yoshimori Nakase and Teiji Nishizawa", title = "An elementary processor architecture with simultaneous instruction issuing from multiple threads", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "136--145", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", 
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Hirata:1992:MPA, author = "H. Hirata and Y. Mochizuki and A. Nishimura and Y. Nakase and T. Nishizawa", title = "A multithreaded processor architecture with simultaneous instruction issuing", journal = j-SUPERCOMPUTER, volume = "9", number = "3", pages = "23--39", month = may, year = "1992", CODEN = "SPCOEL", ISSN = "0168-7875", bibdate = "Wed Mar 18 08:37:01 MST 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, affiliation = "Media Res. Lab., Matsushita Electr. Ind. Co., Osaka, Japan", classification = "C5220P (Parallel architecture); C6110P (Parallel programming); C6150J (Operating systems)", corpsource = "Media Res. Lab., Matsushita Electr. Ind. Co., Osaka, Japan", fjournal = "Supercomputer", keywords = "functional unit; independent instruction streams; multiprogramming; multithreaded processor architecture; parallel processing; scheduling; simultaneous instruction issuing; vector machines; VLIW machines", pubcountry = "Netherlands", treatment = "P Practical", } @InProceedings{Hironaka:1992:BVP, author = "T. Hironaka and T. Hashimoto and K. Okazaki and K. Murakami", title = "Benchmarking a Vector-Processor Prototype Based on Multithreaded Streaming\slash {FIFO} Vector ({MSFV}) Architecture", crossref = "ACM:1992:CPI", pages = "272--281", year = "1992", bibdate = "Mon Aug 26 10:38:41 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Hum:1992:HSM, author = "Herbert H. J. Hum and Guang R.
Gao", title = "A high-speed memory organization for hybrid dataflow\slash {von Neumann} computing", journal = j-FUT-GEN-COMP-SYS, volume = "8", number = "4", pages = "287--301", month = sep, year = "1992", CODEN = "FGSEVI", ISSN = "0167-739X (print), 1872-7115 (electronic)", ISSN-L = "0167-739X", bibdate = "Fri Jul 15 09:06:02 MDT 2005", bibsource = "ftp://ftp.ira.uka.de/bibliography/Os/threads.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sciencedirect.com/science/journal/0167739X", abstract = "The paper proposes a novel organization of high-speed memories, known as the register-cache, for a multi-threaded architecture. Viewed from the execution unit, its contents are addressable as ordinary CPU registers using relatively short addresses. From the main memory perspective, it is content addressable. In this register-cache organization, a number of registers are grouped into a block of registers where a register in a block is accessed using an offset from the address of the block, an offset value which is embedded in the compiler generated code. The binding of register block locations to register-cache line addresses is adaptively performed at runtime, thus resulting in a dynamically allocated register file. In this execution model, a program is compiled into a number of instruction threads called super-actors. 
A super-actor becomes ready for execution only when its input data are physically residing in the register-cache and space is reserved in the register-cache to store its result.", acknowledgement = ack-nhfb, fjournal = "Future Generation Computer Systems", journal-URL = "http://www.sciencedirect.com/science/journal/0167739X", } @Article{Jagannathan:1992:CSC, author = "Suresh Jagannathan and Jim Philbin", title = "A customizable substrate for concurrent languages", journal = j-SIGPLAN, volume = "27", number = "7", pages = "55--67", month = jul, year = "1992", CODEN = "SINODQ", ISBN = "0-89791-475-9", ISBN-13 = "978-0-89791-475-8", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", LCCN = "QA76.7.S53 1992", bibdate = "Sun Dec 14 09:16:22 MST 2003", bibsource = "Compendex database; http://www.acm.org/pubs/contents/proceedings/pldi/143095/index.html; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/proceedings/pldi/143095/p55-jagannathan/", abstract = "We describe an approach to implementing a wide-range of concurrency paradigms in high-level (symbolic) programming languages. The focus of our discussion is STING, a dialect of Scheme, that supports lightweight threads of control and virtual processors as first-class objects. Given the significant degree to which the behavior of these objects may be customized, we can easily express a variety of concurrency paradigms and linguistic structures within a common framework without loss of efficiency. Unlike parallel systems that rely on operating system services for managing concurrency, STING implements concurrency management entirely in terms of Scheme objects and procedures. It, therefore, permits users to optimize the runtime behavior of their applications without requiring knowledge of the underlying runtime system. 
This paper concentrates on (a) the implications of the design for building asynchronous concurrency structures, (b) organizing large-scale concurrent computations, and (c) implementing robust programming environments for symbolic computing.", acknowledgement = ack-nhfb, affiliation = "NEC Research Inst", affiliationaddress = "Princeton, NJ, USA", annote = "Published as part of the Proceedings of PLDI'92.", classification = "723.1", conference = "Proceedings of the ACM SIGPLAN '92 Conference on Programming Language Design and Implementation", conferenceyear = "1992", fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", journalabr = "SIGPLAN Not", keywords = "algorithms; Computer programming languages; Concurrency paradigms; Concurrency structures; design; languages; Parallel processing systems; performance; Robust programming; Symbolic programming languages", meetingaddress = "San Francisco, CA, USA", meetingdate = "Jun 17--19 1992", meetingdate2 = "06/17--19/92", sponsor = "ACM", subject = "{\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language Classifications, Concurrent, distributed, and parallel languages. {\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language Classifications, SCHEME. {\bf D.1.3} Software, PROGRAMMING TECHNIQUES, Concurrent Programming, Parallel programming.", } @Article{Koopman:1992:CBC, author = "Philip J. {Koopman, Jr.} and Peter Lee and Daniel P. 
Siewiorek", title = "Cache Behavior of Combinator Graph Reduction", journal = j-TOPLAS, volume = "14", number = "2", pages = "265--297", month = apr, year = "1992", CODEN = "ATPSDT", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Sat Jan 06 14:28:31 1996", bibsource = "Compiler/Compiler.Lins.bib; Compiler/garbage.collection.bib; Compiler/Heaps.bib; Compiler/TOPLAS.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; Theory/CLiCS.bib", note = "Also see~\cite{Koopman:1992:CBC}.", URL = "http://www.acm.org/pubs/toc/Abstracts/0164-0925/128867.html", abstract = "The results of cache-simulation experiments with an abstract machine for reducing combinator graphs are presented. The abstract machine, called TIGRE, exhibits reduction rates that, for similar kinds of combinator graphs on similar kinds of hardware, compare favorably with previously reported techniques. Furthermore, TIGRE maps easily and efficiently onto standard computer architectures, particularly those that allow a restricted form of self-modifying code. This provides some indication that the conventional ``stored program'' organization of computer systems is not necessarily an inappropriate one for functional programming language implementations.\par This is not to say, however, that present day computer systems are well equipped to reduce combinator graphs. In particular, the behavior of the cache memory has a significant effect on performance. In order to study and quantify this effect, trace-driven cache simulations of a TIGRE graph reducer running on a reduced instruction-set computer are conducted. The results of these simulations are presented with the following hardware-cache parameters varied: cache size, block size, associativity, memory update policy, and write-allocation policy. 
To begin with, the cache organization of a commercially available system is used and then the performance sensitivity with respect to variations of each parameter are measured. From the results of the simulation study, a conclusion is made that combinator-graph reduction using TIGRE runs most efficiently when using a cache memory with an allocate-on-write-miss strategy, moderately large block size (preferably with subblock placement), and copy-back memory updates.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Programming Languages and Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783", keywords = "algorithms; languages; performance; theory; threading", sjb = "In amongst all the cache stuff is a description of how subroutine threading can form the basis for a relatively efficient method of performing combinator graph reduction.", subject = "{\bf B.3.2}: Hardware, MEMORY STRUCTURES, Design Styles, Cache memories. {\bf B.3.3}: Hardware, MEMORY STRUCTURES, Performance Analysis and Design Aids, Simulation. {\bf D.1.1}: Software, PROGRAMMING TECHNIQUES, Applicative (Functional) Programming. {\bf D.3.2}: Software, PROGRAMMING LANGUAGES, Language Classifications, Applicative languages. {\bf D.3.4}: Software, PROGRAMMING LANGUAGES, Processors, Compilers. {\bf D.3.4}: Software, PROGRAMMING LANGUAGES, Processors, Interpreters. {\bf G.2.1}: Mathematics of Computing, DISCRETE MATHEMATICS, Combinatorics.", } @Article{Kuchlin:1992:MTC, author = "W. 
Kuchlin", title = "On the Multi-Threaded Computation of Modular Polynomial Greatest Common Divisors", journal = j-LECT-NOTES-COMP-SCI, volume = "591", pages = "369--??", year = "1992", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Mon May 13 11:46:24 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Lenir:1992:EIL, author = "Philip Lenir and R. Govindarajan and S. S. Nemawarkar", title = "Exploiting instruction-level parallelism: the multithreaded approach", journal = j-SIGMICRO, volume = "23", number = "1--2", pages = "189--192", month = dec, year = "1992", DOI = "https://doi.org/10.1145/144965.145798", bibdate = "Fri Apr 16 10:27:43 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigmicro.bib", URL = "https://dl.acm.org/doi/10.1145/144965.145798", acknowledgement = ack-nhfb, fjournal = "ACM SIGMICRO Newsletter", journal-URL = "https://dl.acm.org/loi/sigmicro", } @Article{LeSergent:1992:IMT, author = "T. {Le Sergent} and B. Berthomieu", title = "Incremental Multi-Threaded Garbage Collection on Virtually Shared Memory Architectures", journal = j-LECT-NOTES-COMP-SCI, volume = "637", pages = "179--??", year = "1992", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Mon May 13 11:46:24 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Nikhil:1992:MMP, author = "R. S. Nikhil and G. M. 
Papadopoulos and Arvind", title = "{*T}: a multithreaded massively parallel architecture", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "156--167", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Ogata:1992:DIH, author = "Kazuhiro Ogata and Satoshi Kurihara and Mikio Inari and Norihisa Doi", title = "The design and implementation of {HoME}", journal = j-SIGPLAN, volume = "27", number = "7", pages = "44--54", month = jul, year = "1992", CODEN = "SINODQ", ISBN = "0-89791-475-9", ISBN-13 = "978-0-89791-475-8", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", LCCN = "QA76.7.S53 1992", bibdate = "Sun Dec 14 09:16:22 MST 2003", bibsource = "Compendex database; http://www.acm.org/pubs/contents/proceedings/pldi/143095/index.html; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/proceedings/pldi/143095/p44-ogata/", abstract = "HoME is a version of Smalltalk which can be efficiently executed on a multiprocessor and can be executed in parallel by combining a Smalltalk process with a Mach thread and executing the process on the thread. HoME is nearly the same as ordinary Smalltalk except that multiple processes may execute in parallel. Thus, almost all applications running on ordinary Smalltalk can be executed on HoME without changes in their code.
HoME was designed and implemented based on the following fundamental policies: (1) theoretically, an infinite number of processes can become active; (2) the moment a process is scheduled, it becomes active; (3) no process switching occurs; (4) HoME is equivalent to ordinary Smalltalk except for the previous three policies. The performance of the current implementation of HoME running on OMRON LUNA-88K, which had four processors, was measured by benchmarks which execute in parallel with multiple processes. In all benchmarks, the results showed that HoME's performance is much better than HPS on the same workstation.", acknowledgement = ack-nhfb, affiliation = "Keio Univ", affiliationaddress = "Yokohama, Jpn", annote = "Published as part of the Proceedings of PLDI'92.", classification = "723.1", conference = "Proceedings of the ACM SIGPLAN '92 Conference on Programming Language Design and Implementation", conferenceyear = "1992", fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", journalabr = "SIGPLAN Not", keywords = "Computer programming; design; HPS on Mach environment; languages; measurement; Object oriented programming; performance; Smalltalk", meetingaddress = "San Francisco, CA, USA", meetingdate = "Jun 17--19 1992", meetingdate2 = "06/17--19/92", sponsor = "ACM", subject = "{\bf D.1.3} Software, PROGRAMMING TECHNIQUES, Concurrent Programming. {\bf D.3.4} Software, PROGRAMMING LANGUAGES, Processors. {\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language Classifications, Smalltalk. {\bf D.2.8} Software, SOFTWARE ENGINEERING, Metrics, Performance measures.", } @InProceedings{Papadopoulos:1992:MCS, author = "G. M. Papadopoulos and A. P. W. Bohm and A. T. Dahbura and R. R. 
Oldehoeft", title = "Multithreaded computer systems", crossref = "IEEE:1992:PSM", pages = "772--775", year = "1992", bibdate = "Wed Apr 15 15:37:20 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classification = "C5220P (Parallel architecture)", corpsource = "Lab. for Comput. Sci., MIT, Cambridge, MA, USA", keywords = "architectural principles; data matching; multithreaded computer systems; parallel architectures; parallel machines; split-phase memory accesses", sponsororg = "IEEE; ACM", treatment = "P Practical", } @InProceedings{Peacock:1992:EMS, author = "J. Kent Peacock and Sunil Saxena and Dean Thomas and Fred Yang and Wilfred Yu", title = "Experiences from Multithreading System {V} Release 4", crossref = "USENIX:1992:SED", pages = "77--92", day = "26--27", month = mar, year = "1992", bibdate = "Fri Oct 18 07:24:24 MDT 1996", bibsource = "ftp://ftp.uu.net/library/bibliography; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, affiliation = "Intel Multiprocessor Consortium", } @InProceedings{Peacock:1992:FSM, author = "J. Kent Peacock", title = "File System Multithreading in {System V Release} 4 {MP}", crossref = "USENIX:1992:PSU", pages = "19--30", month = "Summer", year = "1992", bibdate = "Tue Feb 20 15:42:13 MST 1996", bibsource = "ftp://ftp.uu.net/library/bibliography; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, affiliation = "Intel Multi-Processor Consortium", } @InProceedings{Pham:1992:MDA, author = "Thuan Q. Pham and Pankaj K. 
Garg", title = "On Migrating a Distributed Application to a Multithreaded Environment", crossref = "USENIX:1992:PSU", pages = "45--54", month = "Summer", year = "1992", bibdate = "Fri Oct 18 07:24:24 MDT 1996", bibsource = "ftp://ftp.uu.net/library/bibliography; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, affiliation = "Hewlett--Packard Laboratories", } @Article{Sato:1992:TBP, author = "Mitsuhisa Sato and Yuetsu Kodama and Shuichi Sakai and Yoshinori Yamaguchi and Yasuhito Koumura", title = "Thread-based programming for the {EM-4} hybrid dataflow machine", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "146--155", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Schwan:1992:MRT, author = "Karsten Schwan and Hongyi Zhou", title = "Multiprocessor real-time threads", journal = j-OPER-SYS-REV, volume = "26", number = "1", pages = "54--65", month = jan, year = "1992", CODEN = "OSRED8", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:36 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @Article{Singh:1992:DRS, author = "Gurjot Singh and Moses Joseph and Dave Barnett", title = "Debugging real-time systems", journal = j-DDJ, volume = "17", number = "9", pages = "70, 72, 74, 76--77, 116--117", month = sep, year = "1992", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Tue Sep 10 10:06:23 MDT 1996", bibsource = "http://www.ddj.com/index/author/index.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database", 
abstract = "Modular and incremental development and debugging lead to reliable real-time systems that perform the functions they're designed to. Our authors use this approach when building a simulated data-acquisition system.", acknowledgement = ack-nhfb, affiliation = "Lynx Real-Time Syst., Los Gatos, CA, USA", classification = "C6150G (Diagnostic, testing, debugging and evaluating systems)", fjournal = "Dr. Dobb's Journal of Software Tools", keywords = "Correctness; Debugging cycle; Ldb; POSIX; Real-time systems; User-friendly multithreaded debugger; Worst-case performance", thesaurus = "C listings; Program debugging; Real-time systems", } @Article{Singh:1992:DRT, author = "Gurjot Singh and Moses Joseph and Dave Barnett", title = "Debugging real-time systems", journal = j-DDJ, volume = "17", number = "9", pages = "70, 72, 74, 76--77, 116--117", month = sep, year = "1992", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Tue Sep 10 10:06:23 MDT 1996", bibsource = "http://www.ddj.com/index/author/index.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database", abstract = "Modular and incremental development and debugging lead to reliable real-time systems that perform the functions they're designed to. Our authors use this approach when building a simulated data-acquisition system.", acknowledgement = ack-nhfb, affiliation = "Lynx Real-Time Syst., Los Gatos, CA, USA", classification = "C6150G (Diagnostic, testing, debugging and evaluating systems)", fjournal = "Dr. 
Dobb's Journal of Software Tools", keywords = "Correctness; Debugging cycle; Ldb; POSIX; Real-time systems; User-friendly multithreaded debugger; Worst-case performance", thesaurus = "C listings; Program debugging; Real-time systems", xxnote = "Apparent duplicate of entry \cite{Singh:1992:DRS}??", } @Article{Smith:1992:MTX, author = "John Allen Smith", title = "The Multi-Threaded {X} Server", journal = j-X-RESOURCE, volume = "1", number = "1", pages = "73--89", month = jan, year = "1992", CODEN = "XRESEA", ISBN = "0-937175-96-X", ISBN-13 = "978-0-937175-96-5", ISSN = "1058-5591", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "The X Resource", } @PhdThesis{Young-Myers:1992:DTC, author = "Helene Wen-Hsin Young-Myers", title = "Database transitive closure: a performance study of multithreaded algorithms", type = "Thesis ({Ph.D.})", school = "College of Business and Management, University of Maryland at College Park", address = "College Park, MD, USA", pages = "ix + 198", year = "1992", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Agarwal:1993:SMV, author = "Anant Agarwal and Jonathan Babb and David Chaiken and Godfrey D'Souza and Kirk Johnson and David Kranz and John Kubiatowicz and Beng-Hong Lim and Gino Maa and Ken Mackenzie", title = "Sparcle: a Multithreaded {VLSI} Processor for Parallel Computing", journal = j-LECT-NOTES-COMP-SCI, volume = "748", pages = "359--??", year = "1993", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Mon May 13 11:49:00 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Bic:1993:EUI, author = "Lubomir Bic and Mayez Al-Mouhamed", title = "The {EM-4} under Implicit Parallelism", journal = j-J-PAR-DIST-COMP, volume = "19", number = "3", pages = "255--261", month
= nov, year = "1993", CODEN = "JPDCER", DOI = "https://doi.org/10.1006/jpdc.1993.1109", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Thu Mar 9 09:18:53 MST 2000", bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1109/production; http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1109/production/pdf", acknowledgement = ack-nhfb, classification = "C5220P (Parallel architecture); C6110P (Parallel programming)", corpsource = "Dept. of Inf. and Comput. Sci., California Univ., Irvine, CA, USA", fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", keywords = "analysis; benchmark programs; data distribution; data-dependency; Data-Distributed Execution; DDE; EM-4; implicit parallelism; interprocessor communication; iteration-level parallelism; loops; multithreading; parallel architectures; parallel programming; parallelization", treatment = "P Practical; T Theoretical or Mathematical", } @InProceedings{Blumofe:1993:SES, author = "Robert D. Blumofe and Charles E. 
Leiserson", title = "Space-efficient scheduling of multithreaded computations", crossref = "ACM:1993:PTF", pages = "362--371", year = "1993", bibdate = "Wed Feb 20 18:34:01 MST 2002", bibsource = "http://www.acm.org/pubs/contents/proceedings/series/stoc/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org/pubs/articles/proceedings/stoc/167088/p362-blumofe/p362-blumofe.pdf; http://www.acm.org/pubs/citations/proceedings/stoc/167088/p362-blumofe/", acknowledgement = ack-nhfb, } @PhdThesis{Boothe:1993:EMC, author = "Bob Boothe", title = "Evaluation of multithreading and caching in large shared memory parallel computers", type = "Thesis ({Ph.D.})", school = "University of California, Berkeley, Computer Science Division", address = "Berkeley, CA, USA", pages = "ix + 169", month = jul, year = "1993", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "Also available as Report UCB/CSD 93/766.", acknowledgement = ack-nhfb, annote = "Supported in part by the Air Force Office of Scientific Research (AFOSR/JSEP), by the NSF, and by an NSF Infrastructure Grant.", keywords = "Multiprocessors", } @MastersThesis{Chong:1993:EMC, author = "Yong-Kim Chong", title = "Effects of memory consistency models on multithreaded multiprocessor performance", type = "Thesis ({M.S.})", school = "University of Southern California", address = "Los Angeles, CA, USA", pages = "viii + 89", year = "1993", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Culler:1993:TCC, author = "David E. 
Culler and Seth Copen Goldstein and Klaus Erik Schauser and Thorsten {Von Eicken}", title = "{TAM} -- a Compiler Controlled {Threaded Abstract Machine}", journal = j-J-PAR-DIST-COMP, volume = "18", number = "3", pages = "347--370", month = jul, year = "1993", CODEN = "JPDCER", DOI = "https://doi.org/10.1006/jpdc.1993.1070", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Thu Mar 9 09:18:52 MST 2000", bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1070/production; http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1070/production/pdf", acknowledgement = ack-nhfb, classification = "C5220P (Parallel architecture)", corpsource = "Div. of Comput. Sci., California Univ., Berkeley, CA, USA", fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", keywords = "dataflow execution models; parallel architectures; parallel programming; parallel threads; self-scheduled machine language; Threaded Abstract Machine", treatment = "P Practical", } @Article{Dillon:1993:VEM, author = "Laura K. Dillon", title = "A visual execution model for {Ada} tasking", journal = j-TOSEM, volume = "2", number = "4", pages = "311--345", month = oct, year = "1993", CODEN = "ATSMER", ISSN = "1049-331X (print), 1557-7392 (electronic)", ISSN-L = "1049-331X", bibdate = "Fri Apr 20 08:21:35 MDT 2001", bibsource = "http://www.acm.org/pubs/toc/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org/pubs/articles/journals/tosem/1993-2-4/p311-dillon/p311-dillon.pdf; http://www.acm.org/pubs/citations/journals/tosem/1993-2-4/p311-dillon/", abstract = "A visual execution model for Ada tasking can help programmers attain a deeper understanding of the tasking semantics. 
It can illustrate subtleties in semantic definitions that are not apparent in natural language design. We describe a contour model of Ada tasking that depicts asynchronous tasks (threads of control), relationships between the environments in which tasks execute, and the manner in which tasks interact. The use of this high-level execution model makes it possible to see what happens during execution of a program. The paper provides an introduction to the contour model of Ada tasking and demonstrates its use.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Software Engineering and Methodology", generalterms = "Algorithms; Design; Languages", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J790", keywords = "contour model; visual execution model", subject = "Software --- Software Engineering --- Design Tools and Techniques (D.2.2); Software --- Software Engineering --- Programming Environments (D.2.6); Software --- Programming Languages --- Formal Definitions and Theory (D.3.1): {\bf Semantics}; Software --- Programming Languages --- Language Classifications (D.3.2): {\bf Ada}; Software --- Programming Languages --- Language Constructs and Features (D.3.3): {\bf Concurrent programming structures}; Software --- Programming Techniques --- Concurrent Programming (D.1.3); Theory of Computation --- Logics and Meanings of Programs --- Semantics of Programming Languages (F.3.2): {\bf Operational semantics}; Software --- Programming Languages --- Processors (D.3.4): {\bf Interpreters}", } @InProceedings{Doligez:1993:CGG, author = "Damien Doligez and Xavier Leroy", title = "A concurrent, generational garbage collector for a multithreaded implementation of {ML}", crossref = "ACM:1993:CRT", pages = "113--123", year = "1993", bibdate = "Mon May 3 12:45:53 MDT 1999", bibsource = "http://www.acm.org/pubs/toc/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/proceedings/plan/158511/p113-doligez/", abstract = 
"This paper presents the design and implementation of a ``quasi real-time'' garbage collector for Concurrent Caml Light, an implementation of ML with threads. This two-generation system combines a fast, asynchronous copying collector on the young generation with a non-disruptive concurrent marking collector on the old generation. This design crucially relies on the ML compile-time distinction between mutable and immutable objects.", acknowledgement = ack-nhfb, keywords = "algorithms; design; experimentation; languages; performance", subject = "{\bf D.3.3} Software, PROGRAMMING LANGUAGES, Language Constructs and Features, Concurrent programming structures. {\bf D.3.4} Software, PROGRAMMING LANGUAGES, Processors, Compilers. {\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language Classifications, LML.", } @Article{Eager:1993:CER, author = "Derek L. Eager and John Zahorjan", title = "Chores: Enhanced Run-Time Support for Shared-Memory Parallel Computing", journal = j-TOCS, volume = "11", number = "1", pages = "1--32", month = feb, year = "1993", CODEN = "ACSYEC", ISSN = "0734-2071 (print), 1557-7333 (electronic)", ISSN-L = "0734-2071", bibdate = "Wed Jan 13 18:36:53 MST 1999", bibsource = "http://www.acm.org/pubs/contents/journals/tocs/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/journals/tocs/1993-11-1/p1-eager/", abstract = "Parallel computing is increasingly important in the solution of large-scale numerical problems. The difficulty of efficiently hand-coding parallelism, and the limitations of parallelizing compilers, have nonetheless restricted its use by scientific programmers. In this paper we propose a new paradigm, {\em chores}, for the run-time support of parallel computing on shared-memory multiprocessors.
We consider specifically uniform memory access shared-memory environments, although the chore paradigm should also be appropriate for use within the clusters of a large-scale nonuniform memory access machine. We argue that chore systems attain both the high efficiency of compiler approaches for the common case of data parallelism, and the flexibility and performance of user-level thread approaches for functional parallelism. These benefits are achieved within a single, simple conceptual model that almost entirely relieves the programmer and compiler from concerns of granularity, scheduling, and enforcement of synchronization constraints. Measurements of a prototype implementation demonstrate that the chore model can be supported more efficiently than can traditional approaches to either data or functional parallelism alone.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Computer Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J774", keywords = "design; measurement; performance", subject = "{\bf D.4.1} Software, OPERATING SYSTEMS, Process Management. {\bf D.4.9} Software, OPERATING SYSTEMS, Systems Programs and Utilities. {\bf D.4.7} Software, OPERATING SYSTEMS, Organization and Design, Distributed systems. {\bf C.3} Computer Systems Organization, SPECIAL-PURPOSE AND APPLICATION-BASED SYSTEMS. {\bf C.4} Computer Systems Organization, PERFORMANCE OF SYSTEMS.", } @MastersThesis{Estep:1993:LMM, author = "James L. 
Estep", title = "Lightweight multithreaded multimedia conference server", type = "Thesis ({M.S.})", school = "West Virginia University", address = "Morgantown, WV, USA", pages = "vi + 57", year = "1993", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Electronic data processing -- Distributed processing; Multimedia systems", } @PhdThesis{Fan:1993:LMC, author = "Xiaoming Fan", title = "Latency-directed multithreaded computation and its architectural support", type = "Thesis ({Ph.D.})", school = "Universit{\"a}t Hamburg", address = "Aachen, Germany", pages = "xi + 174 + 22 + 11", year = "1993", ISBN = "3-8265-0021-0", ISBN-13 = "978-3-8265-0021-3", ISSN = "0945-0807", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "Summary in German.", series = "Berichte aus der Informatik", acknowledgement = ack-nhfb, keywords = "Computer architecture; Parallel processing (Electronic computers)", } @Article{Gao:1993:DMA, author = "Guang Gao and Jean-Luc Gaudiot and Lubomir Bic", title = "Dataflow and Multithreaded Architectures: {Guest Editors}' Introduction", journal = j-J-PAR-DIST-COMP, volume = "18", number = "3", pages = "271--??", month = jul, year = "1993", CODEN = "JPDCER", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Sat Apr 12 16:10:59 MDT 1997", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", xxnote = "Issue missing from UofUtah Marriott Library??", } @Article{Gao:1993:EHD, author = "G. R. 
Gao", title = "An Efficient Hybrid Dataflow Architecture Model", journal = j-J-PAR-DIST-COMP, volume = "19", number = "4", pages = "293--307", month = dec, year = "1993", CODEN = "JPDCER", DOI = "https://doi.org/10.1006/jpdc.1993.1113", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Thu Mar 9 09:18:53 MST 2000", bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1113/production; http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1113/production/pdf", acknowledgement = ack-nhfb, classification = "C5220P (Parallel architecture); C6110P (Parallel programming); C6150N (Distributed systems); C6150C (Compilers, interpreters and other processors)", corpsource = "Adv. Comput. Archit. and Program Structures Group, Montreal Univ., Que., Canada", fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", keywords = "architecture technique; compiling paradigm; concurrent operation; conventional; data-driven instruction; data-driven scheduling scheme; dataflow computers; dataflow software pipelining; efficient hybrid dataflow architecture model; execution; fast pipelined instruction; fine-grain parallelism; hybrid; limited balancing; loop parallelism; multiple instruction; parallel architectures; parallel programming; pipeline; processing; program compilers; scheduling; simple greedy runtime; space efficiency; threads", treatment = "P Practical", } @Book{Gao:1993:SID, author = "Guang R.
Gao and Jean-Luc Gaudiot and Lubomir Bic", title = "Special issue on dataflow and multithreaded architectures", publisher = pub-AP, address = pub-AP:adr, pages = "271--389", year = "1993", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", series = "Journal of parallel and distributed computing; v. 18, no. 3", acknowledgement = ack-nhfb, } @InProceedings{Giering:1993:IAF, author = "E. W. Giering and F. Mueller and T. P. Baker", title = "Implementing {Ada 9X} Features using {POSIX} Threads: Design Issues", crossref = "ACM:1993:TCS", pages = "214--228", year = "1993", bibdate = "Sat Jul 05 17:12:34 1997", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Gildea:1993:MTX, author = "Stephen Gildea", title = "Multi-Threaded {Xlib}", journal = j-X-RESOURCE, volume = "5", number = "1", pages = "159--166", month = jan, year = "1993", CODEN = "XRESEA", ISBN = "1-56592-020-1", ISBN-13 = "978-1-56592-020-0", ISSN = "1058-5591", bibdate = "Tue Mar 23 12:38:27 1993", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "The X Resource", } @Article{Hauser:1993:UTI, author = "Carl Hauser and Christian Jacobi and Marvin Theimer and Brent Welch and Mark Weiser", title = "Using threads in interactive systems: a case study", journal = j-OPER-SYS-REV, volume = "27", number = "5", pages = "94--105", month = dec, year = "1993", CODEN = "OSRED8", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:54 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @Article{Hayden:1993:BIC, author = "Charles Hayden", title = "A brief introduction to {Concurrent Pascal}", journal = j-SIGPLAN, volume = "28", number = "3", pages = "353--354", month = mar, year = "1993", CODEN = 
"SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:16:34 MST 2003", bibsource = "http://portal.acm.org/; http://www.acm.org/pubs/toc/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/proceedings/plan/154766/p353-hayden/", abstract = "Concurrent Pascal is designed for writing concurrent programs such as operating systems and real-time monitoring systems on shared-memory computers. A separate language, Sequential Pascal, is used as the language for applications programs run by operating systems written in Concurrent Pascal. Both languages are extensions of Wirth's Pascal, and share a common threaded code interpreter. The article describes how Concurrent Pascal differs from Wirth's Pascal.", acknowledgement = ack-nhfb, affiliation = "AT and T Bell Labs., Middletown, NJ, USA", classification = "C6110P (Parallel programming); C6140D (High level languages)", confdate = "20-23 April 1993", conflocation = "Cambridge, MA, USA", confname = "HOPL-II. The second ACM SIGPLAN conference on History of programming languages, April 20--23, 1993, Cambridge, MA", confsponsor = "ACM", fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "Concurrent Pascal; languages; Operating systems; Real-time monitoring systems; Sequential Pascal; Shared-memory computers; Threaded code interpreter", subject = "{\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language Classifications, Concurrent Pascal. {\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language Classifications, Pascal. 
{\bf D.3.3} Software, PROGRAMMING LANGUAGES, Language Constructs and Features, Procedures, functions, and subroutines.", thesaurus = "Parallel languages; Pascal", } @Article{Hidaka:1993:MTC, author = "Yasuo Hidaka and Hanpei Koike and Hidehiko Tanaka", title = "Multiple threads in cyclic register windows", journal = j-COMP-ARCH-NEWS, volume = "21", number = "2", pages = "131--142", month = may, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:46 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Hsieh:1993:CME, author = "Wilson C. Hsieh and Paul Wang and William E. Weihl", title = "Computation migration: enhancing locality for distributed-memory parallel systems", journal = j-SIGPLAN, volume = "28", number = "7", pages = "239--248", month = jul, year = "1993", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:16:39 MST 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Computation migration is a technique that is based on compile-time program transformation, for accessing remote data in a distributed-memory parallel system. In contrast with RPC-style access, where the access is performed remotely, and with data migration, where the data is moved so that it is local, computation migration moves part of the current thread to the processor where the data resides. The access is performed at the remote processor, and the migrated thread portion continues to run on that same processor; this makes subsequent accesses in the thread portion local.
The authors describe an implementation of computation migration that consists of two parts: an implementation that migrates single activation frames, and a high-level language annotation that allows a programmer to express when migration is desired. They performed experiments using two applications; these experiments demonstrate that computation migration is a valuable alternative to RPC and data migration.", acknowledgement = ack-nhfb, affiliation = "Lab. of Comput. Sci., MIT, Cambridge, MA, USA", classification = "C6110P (Parallel programming); C6120 (File organisation); C6150C (Compilers, interpreters and other processors)", confdate = "19-22 May 1993", conflocation = "San Diego, CA, USA", confsponsor = "ACM", fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "Compile-time program transformation; Computation migration; Current thread; Distributed-memory parallel system; High-level language annotation; Remote data; Remote processor; Single activation frames", thesaurus = "Distributed memory systems; Parallel programming; Program compilers; Storage management", } @Article{Huelsbergen:1993:CCG, author = "Lorenz Huelsbergen and James R. Larus", title = "A concurrent copying garbage collector for languages that distinguish (im)mutable data", journal = j-SIGPLAN, volume = "28", number = "7", pages = "73--82", month = jul, year = "1993", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:16:39 MST 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, affiliation = "Dept. of Comput.
Sci., Wisconsin-Madison Univ., WI, USA", classification = "C6110P (Parallel programming); C6120 (File organisation); C6150C (Compilers, interpreters and other processors); C6150N (Distributed systems)", confdate = "19-22 May 1993", conflocation = "San Diego, CA, USA", confsponsor = "ACM", fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "Concurrent collection; Concurrent compacting garbage collector; Garbage-collection pauses; Immutable data; Minimal mutator/collector synchronization; Multiple mutator threads; Mutable data; Pure functional languages; Shared-memory parallel computers; Standard ML compiler", thesaurus = "Parallel programming; Program compilers; Shared memory systems; Storage allocation; Storage management", } @InProceedings{Klarlund:1993:GT, author = "Nils Klarlund and Michael I. Schwartzbach", title = "Graph types", crossref = "ACM:1993:CRT", pages = "196--205", year = "1993", bibdate = "Mon May 3 12:45:53 MDT 1999", bibsource = "http://www.acm.org/pubs/toc/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/proceedings/plan/158511/p196-klarlund/", abstract = "Recursive data structures are abstractions of simple records and pointers. They impose a shape invariant, which is verified at compile-time and exploited to automatically generate code for building, copying, comparing, and traversing values without loss of efficiency. However, such values are always tree shaped, which is a major obstacle to practical use. We propose a notion of graph types, which allow common shapes, such as doubly-linked lists or threaded trees, to be expressed concisely and efficiently. We define regular languages of routing expressions to specify relative addresses of extra pointers in a canonical spanning tree. An efficient algorithm for computing such addresses is developed. 
We employ a second-order monadic logic to decide well-formedness of graph type specifications. This logic can also be used for automated reasoning about pointer structures.", acknowledgement = ack-nhfb, keywords = "algorithms; languages; theory", subject = "{\bf F.3.3} Theory of Computation, LOGICS AND MEANINGS OF PROGRAMS, Studies of Program Constructs, Type structure. {\bf D.3.3} Software, PROGRAMMING LANGUAGES, Language Constructs and Features, Data types and structures. {\bf F.2.2} Theory of Computation, ANALYSIS OF ALGORITHMS AND PROBLEM COMPLEXITY, Nonnumerical Algorithms and Problems, Computations on discrete structures. {\bf G.2.2} Mathematics of Computing, DISCRETE MATHEMATICS, Graph Theory, Trees.", } @InProceedings{Koontz:1993:PBM, author = "K. W. Koontz", title = "Port buffers: a {Mach IPC} optimization for handling large volumes of small messages", crossref = "USENIX:1993:PUMb", pages = "89--102", year = "1993", bibdate = "Sat Sep 28 18:52:45 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/mach.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, affiliation = "Appl. Phys. 
Lab., Johns Hopkins Univ., Laurel, MD, USA", classification = "C6150N (Distributed systems)", keywords = "Communications mechanism; Context switches; Distributed systems; Ethernet; High-speed networks; Kernel calls; Local transfer rates; Mach IPC optimization; Mach kernel; Multi-threaded support; Network utilization; Nonshared memory parallel architectures; Port buffers; Staleness feature", thesaurus = "Buffer storage; Electronic messaging; Network operating systems; Optimisation; Remote procedure calls", } @Article{Lee:1993:TW, author = "David Lee", title = "Threads for {Windows} 3", journal = j-DDJ, volume = "18", number = "10", pages = "84--??", month = "Fall", year = "1993", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Tue Sep 03 09:15:44 1996", bibsource = "http://www.ddj.com/index/author/index.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database", note = "Special Issue: Windows Sourcebook.", abstract = "Unlike NT, Windows 3 doesn't provide direct support for threads. With the techniques David illustrates here, you can implement non-preemptive threads in Windows 3.", acknowledgement = ack-nhfb, fjournal = "Dr. Dobb's Journal of Software Tools", } @Article{Lim:1993:WAS, author = "Beng-Hong Lim and Anant Agarwal", title = "Waiting Algorithms for Synchronization in Large-Scale Multiprocessors", journal = j-TOCS, volume = "11", number = "3", pages = "253--294", month = aug, year = "1993", CODEN = "ACSYEC", ISSN = "0734-2071 (print), 1557-7333 (electronic)", ISSN-L = "0734-2071", bibdate = "Wed Jan 13 18:36:53 MST 1999", bibsource = "http://www.acm.org/pubs/contents/journals/tocs/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/journals/tocs/1993-11-3/p253-lim/", abstract = "Through analysis and experiments, this paper investigates two-phase waiting algorithms to minimize the cost of waiting for synchronization in large-scale multiprocessors. 
In a two-phase algorithm, a thread first waits by polling a synchronization variable. If the cost of polling reaches a limit {\em Lpoll\/} and further waiting is necessary, the thread is blocked, incurring an additional fixed cost, {\em B}. The choice of {\em Lpoll\/} is a critical determinant of the performance of two-phase algorithms. We focus on methods for statically determining {\em Lpoll\/} because the run-time overhead of dynamically determining {\em Lpoll\/} can be comparable to the cost of blocking in large-scale multiprocessor systems with lightweight threads. Our experiments show that {\em always-block\/} ({\em Lpoll\/} = 0) is a good waiting algorithm with performance that is usually close to the best of the algorithms compared. We show that even better performance can be achieved with a static choice of {\em Lpoll\/} based on knowledge of likely wait-time distributions. Motivated by the observation that different synchronization types exhibit different wait-time distributions, we prove that a static choice of {\em Lpoll\/} can yield close to optimal on-line performance against an adversary that is restricted to choosing wait times from a fixed family of probability distributions. This result allows us to make an optimal static choice of {\em Lpoll\/} based on synchronization type. For exponentially distributed wait times, we prove that setting {\em Lpoll\/} = ln(e-1){\em B\/} results in a waiting cost that is no more than {\em e/(e-1)\/} times the cost of an optimal off-line algorithm. For uniformly distributed wait times, we prove that setting {\em L\/}poll=1/2(square root of 5 -1){\em B\/} results in a waiting cost that is no more than (square root of 5 + 1)/2 (the golden ratio) times the cost of an optimal off-line algorithm. 
Experimental measurements of several parallel applications on the Alewife multiprocessor simulator corroborate our theoretical findings.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Computer Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J774", keywords = "algorithms; experimentation; performance; theory", subject = "{\bf D.4.1} Software, OPERATING SYSTEMS, Process Management, Synchronization. {\bf D.4.1} Software, OPERATING SYSTEMS, Process Management, Mutual exclusion. {\bf C.4} Computer Systems Organization, PERFORMANCE OF SYSTEMS. {\bf C.1.2} Computer Systems Organization, PROCESSOR ARCHITECTURES, Multiple Data Stream Architectures (Multiprocessors), Parallel processors**. {\bf D.4.8} Software, OPERATING SYSTEMS, Performance, Measurements. {\bf D.4.8} Software, OPERATING SYSTEMS, Performance, Stochastic analysis.", } @Article{McCann:1993:DPA, author = "Cathy McCann and Raj Vaswani and John Zahorjan", title = "A Dynamic Processor Allocation Policy for Multiprogrammed Shared-Memory Multiprocessors", journal = j-TOCS, volume = "11", number = "2", pages = "146--178", month = may, year = "1993", CODEN = "ACSYEC", ISSN = "0734-2071 (print), 1557-7333 (electronic)", ISSN-L = "0734-2071", bibdate = "Wed Jan 13 18:36:53 MST 1999", bibsource = "http://www.acm.org/pubs/contents/journals/tocs/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/journals/tocs/1993-11-2/p146-mccann/", abstract = "We propose and evaluate empirically the performance of a dynamic processor-scheduling policy for multiprogrammed shared-memory multiprocessors. The policy is dynamic in that it reallocates processors from one parallel job to another based on the currently realized parallelism of those jobs. 
The policy is suitable for implementation in production systems in that: ---It interacts well with very efficient user-level thread packages, leaving to them many low-level thread operations that do not require kernel intervention. ---It deals with thread blocking due to user I/O and page faults. ---It ensures fairness in delivering resources to jobs. ---Its performance, measured in terms of average job response time, is superior to that of previously proposed schedulers, including those implemented in existing systems. It provides good performance to very short, sequential (e.g., interactive) requests. We have evaluated our scheduler and compared it to alternatives using a set of prototype implementations running on a Sequent Symmetry multiprocessor. Using a number of parallel applications with distinct qualitative behaviors, we have both evaluated the policies according to the major criterion of overall performance and examined a number of more general policy issues, including the advantage of ``space sharing'' over ``time sharing'' the processors of a multiprocessor, and the importance of cooperation between the kernel and the application in reallocating processors between jobs. We have also compared the policies according to other criteria important in real implementations, in particular, fairness and response time to short, sequential requests. We conclude that a combination of performance and implementation considerations makes a compelling case for our dynamic scheduling policy.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Computer Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J774", keywords = "design; measurement; performance", subject = "{\bf D.4.1} Software, OPERATING SYSTEMS, Process Management, Scheduling. {\bf D.4.1} Software, OPERATING SYSTEMS, Process Management, Multiprocessing/multiprogramming/multitasking. 
{\bf C.1.2} Computer Systems Organization, PROCESSOR ARCHITECTURES, Multiple Data Stream Architectures (Multiprocessors).", } @Article{Morrisett:1993:PLP, author = "J. Gregory Morrisett and Andrew P. Tolmach", title = "Procs and locks: a portable multiprocessing platform for {Standard ML} of {New Jersey}", journal = j-SIGPLAN, volume = "28", number = "7", pages = "198--207", month = jul, year = "1993", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:16:39 MST 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "A portable platform has been built for running Standard ML of New Jersey programs on multiprocessors. It can be used to implement user-level thread packages for multiprocessors within the ML language with first-class continuations. The platform supports experimentation with different thread scheduling policies and synchronization constructs. It has been used to construct a Modula-3 style thread package and a version of Concurrent ML, and has been ported to three different multiprocessors running variants of Unix. 
The authors describe the platform's design, implementation, and performance.", acknowledgement = ack-nhfb, affiliation = "Carnegie Mellon Univ., Pittsburg, PA, USA", classification = "C6110P (Parallel programming); C6140D (High level languages); C6150C (Compilers, interpreters and other processors)", confdate = "19-22 May 1993", conflocation = "San Diego, CA, USA", confsponsor = "ACM", fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "Concurrent ML; First-class continuations; Functional language; Modula-3 style thread package; New Jersey programs; Portable multiprocessing platform; Portable platform; Standard ML; Synchronization constructs; Thread scheduling policies; User-level thread packages", thesaurus = "Multiprocessing systems; Parallel languages; Parallel programming; Scheduling", } @Article{Najjar:1993:QAD, author = "Walid A. Najjar and A. P. Wim Bohm and W. Marcus Miller", title = "A Quantitative Analysis of Dataflow Program Execution --- Preliminaries to a Hybrid Design", journal = j-J-PAR-DIST-COMP, volume = "18", number = "3", pages = "314--326", month = jul, year = "1993", CODEN = "JPDCER", DOI = "https://doi.org/10.1006/jpdc.1993.1067", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Thu Mar 9 09:18:52 MST 2000", bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1067/production; http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1067/production/pdf", acknowledgement = ack-nhfb, classification = "C6110B (Software engineering techniques); C6110P (Parallel programming)", corpsource = "Dept. of Comput. 
Sci., Colorado State Univ., Fort Collins, CO, USA", fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", keywords = "benchmarks; dataflow program execution; dynamic measure; fine grain intrathread locality; instruction level locality; parallel programming; software metrics", treatment = "T Theoretical or Mathematical", } @Article{Natarajan:1993:PVM, author = "Venkat Natarajan and Derek Chiou and Boon Seong Ang", title = "Performance visualization on {Monsoon}", journal = j-J-PAR-DIST-COMP, volume = "18", number = "2", pages = "169--180", month = jun, year = "1993", CODEN = "JPDCER", DOI = "https://doi.org/10.1006/jpdc.1993.1054", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Thu Mar 9 09:18:52 MST 2000", bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1054/production; http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1054/production/pdf", acknowledgement = ack-nhfb, classification = "C5440 (Multiprocessor systems and techniques); C5470 (Performance evaluation and testing); C7430 (Computer engineering)", corpsource = "Motorola Cambridge Res. 
Center, MA, USA", fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", keywords = "algorithm; application program; compiler; computer evaluation; data analysis; data collection; data visualisation; MIT; Monsoon; Motorola; multiprocessor machine; multithreaded; operating system; parallel machine; parallel machines; performance evaluation; performance evaluation tool; programming language; visualization", treatment = "P Practical", } @InProceedings{Odersky:1993:CNA, author = "Martin Odersky and Dan Rabin and Paul Hudak", title = "Call by name, assignment, and the lambda calculus", crossref = "ACM:1993:CRT", pages = "43--56", year = "1993", bibdate = "Mon May 3 12:45:53 MDT 1999", bibsource = "http://www.acm.org/pubs/toc/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/proceedings/plan/158511/p43-odersky/", abstract = "We define an extension of the call-by-name lambda calculus with additional constructs and reduction rules that represent mutable variables and assignments. The extended calculus has neither a concept of an explicit store nor a concept of evaluation order; nevertheless, we show that programs in the calculus can be implemented using a single-threaded store. We also show that the new calculus has the Church--Rosser property and that it is a conservative extension of classical lambda calculus with respect to operational equivalence; that is, all algebraic laws of the functional subset are preserved.", acknowledgement = ack-nhfb, keywords = "languages; theory", subject = "{\bf F.4.1} Theory of Computation, MATHEMATICAL LOGIC AND FORMAL LANGUAGES, Mathematical Logic, Lambda calculus and related systems. 
{\bf F.3.3} Theory of Computation, LOGICS AND MEANINGS OF PROGRAMS, Studies of Program Constructs, Type structure.", } @Article{Plauger:1993:MCS, author = "Dave Plauger", title = "Making {C++} Safe for Threads", journal = j-CUJ, volume = "11", number = "2", pages = "58--??", month = feb, year = "1993", ISSN = "0898-9788", bibdate = "Fri Aug 30 16:52:23 MDT 1996", bibsource = "http://www.cuj.com/cbklist.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "C Users Journal", } @InProceedings{Raghunath:1993:DIN, author = "M. T. Raghunath and Abhiram Ranade", title = "Designing Interconnection Networks for Multi-Level Packaging", crossref = "IEEE:1993:PSP", pages = "772--781", year = "1993", bibdate = "Wed Apr 15 12:04:03 MDT 1998", bibsource = "Compendex database; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, affiliation = "Univ of California", affiliationaddress = "Berkeley, CA, USA", classification = "723; C5220P (Parallel architecture); C5440 (Multiprocessing systems)", corpsource = "Comput. Sci. 
Div., California Univ., Berkeley, CA, USA", keywords = "communication bandwidth; complete graphs; Computer networks; generic set; global communication performance; high bandwidth channels; high degree deBruijn graphs; Interconnection network design; interconnection networks design; Large scale parallel machines; large scale parallel machines; latencies; Multilevel packaging; multilevel packaging; multiprocessor interconnection networks; multithreading; network organizations; network topology; packaging; packaging constraints; packaging hierarchy; packaging restrictions; packaging technology; Parallel processing systems; Random traffic model; random traffic model", sponsororg = "IEEE; ACM SIGARCH", treatment = "P Practical", } @MastersThesis{Rajagopal:1993:DMI, author = "Arjun Rajagopal", title = "Design of a multithreaded instruction cache for a hyperscalar processor", type = "Thesis ({M.S.})", school = "Department of Electrical Engineering, Texas A\&M University", address = "College Station, TX, USA", pages = "ix + 84", year = "1993", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Major electrical engineering", } @InProceedings{Saxena:1993:PMS, author = "Sunil Saxena and J. 
Kent Peacock and Fred Yang and Vijaya Verma and Mohan Krishnan", title = "Pitfalls in Multithreading {SVR4 STREAMS} and Other Weightless Processes", crossref = "USENIX:1993:PWU", pages = "85--96", month = "Winter", year = "1993", bibdate = "Tue Oct 22 08:14:49 2002", bibsource = "ftp://ftp.uu.net/library/bibliography; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.usenix.org/publications/library/proceedings/sd93/", acknowledgement = ack-nhfb, affiliation = "Intel Multiprocessor Consortium", } @InProceedings{Schmidtmann:1993:DIM, author = "Carl Schmidtmann and Michael Tao and Steven Watt", title = "Design and Implementation of a Multi-Threaded {Xlib}", crossref = "USENIX:1993:PWU", pages = "193--203", month = "Winter", year = "1993", bibdate = "Tue Oct 22 08:16:35 2002", bibsource = "ftp://ftp.uu.net/library/bibliography; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.usenix.org/publications/library/proceedings/sd93/", acknowledgement = ack-nhfb, affiliation = "Consultant to Digital Equipment Corporation; Sun Microsystems; Consultant to Xerox Corporation", } @MastersThesis{Srinivasan:1993:SDS, author = "Sumathi Srinivasan", title = "System design and simulation for the {Demus-2} multithreaded processor", type = "Thesis ({M. Eng.})", school = "Department of Electrical and Computer Engineering, McMaster University", address = "Hamilton, ON, Canada", pages = "x + 109", year = "1993", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Computer architecture; Computers, Pipeline; McMaster University. -- Dissertations; Parallel processing (Electronic computers)", } @Article{Volkman:1993:CCP, author = "Victor R. 
Volkman", title = "Convert {C} Programs into Multithreaded Applications", journal = j-CUJ, volume = "11", type = "User Report", number = "4", pages = "87--??", month = apr, year = "1993", ISSN = "0898-9788", bibdate = "Fri Aug 30 16:52:23 MDT 1996", bibsource = "http://www.cuj.com/cbklist.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "C Users Journal", } @Article{Volkman:1993:CDB, author = "Victor R. Volkman and John English", title = "Class {{\tt DOSThread}}: a Base Class for Multithreaded {DOS} Programs", journal = j-CUJ, volume = "11", type = "CUG library disk documentation", number = "12", pages = "113--??", month = dec, year = "1993", ISSN = "0898-9788", bibdate = "Fri Aug 30 16:52:23 MDT 1996", bibsource = "http://www.cuj.com/cbklist.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "C Users Journal", } @Article{Waldspurger:1993:RRF, author = "Carl A. Waldspurger and William E. 
Weihl", title = "Register relocation: flexible contexts for multithreading", journal = j-COMP-ARCH-NEWS, volume = "21", number = "2", pages = "120--130", month = may, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:46 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @TechReport{Young-Myers:1993:ESTa, author = "Helene Young-Myers and Louiqa Raschid", title = "An experimental study of three dataflow paradigms in multithreaded database transitive closure algorithms on shared memory multiprocessors", type = "Technical report", number = "CS-TR-3060; UMIACS-TR-93-33", institution = inst-U-MARYLAND, address = inst-U-MARYLAND:adr, pages = "21", month = apr, year = "1993", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "To appear in a special issue of the Journal of Parallel and Distributed Computing on Dataflow and Multithreaded Architectures, July, 1993.", abstract = "This paper describes an experimental study of three dataflow paradigms, namely, no dataflow, pipelined dataflow, and network dataflow, in multithreaded database transitive closure algorithms on shared memory multiprocessors. This study shows that dataflow paradigm directly influences performance parameters such as the amount of interthread communication, how data are partitioned among the threads, whether access to each page of data is exclusive or shared, whether locks are needed for concurrency control, and how calculation termination is detected. The algorithm designed with no dataflow outperforms the algorithms with dataflow. Approximately linear speedup is achieved by the no dataflow algorithm with sufficient workload and primary memory. 
An exclusive access working set model and a shared access working set model describe the interactions between two or more threads' working sets when access to each page of data is exclusive or shared among the threads, respectively. These models are experimentally verified.", acknowledgement = ack-nhfb, annote = "Supported in part by the National Science Foundation.", keywords = "Data flow computing; Multiprocessors", } @Article{Young-Myers:1993:ESTb, author = "Helene Young-Myers and Louiqa Raschid", title = "An Experimental Study of Three Dataflow Paradigms in Multithreaded Database Transitive Closure Algorithms on Shared Memory Multiprocessors", journal = j-J-PAR-DIST-COMP, volume = "18", number = "3", pages = "371--389", month = jul, year = "1993", CODEN = "JPDCER", DOI = "https://doi.org/10.1006/jpdc.1993.1071", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Thu Mar 9 09:18:52 MST 2000", bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1071/production; http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1071/production/pdf", acknowledgement = ack-nhfb, classification = "C5220P (Parallel architecture); C5470 (Performance evaluation and testing); C6160 (Database management systems (DBMS))", corpsource = "Maryland Univ., College Park, MD, USA", fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", keywords = "architectures; calculation termination; concurrency control; database management systems; dataflow; dataflow paradigms; exclusive access; interthread communication; linear; network; no dataflow; parallel; performance evaluation; performance parameters; pipelined dataflow; shared access; shared memory systems; speedup", treatment = "P Practical", } @InProceedings{Alfieri:1994:EKI, 
author = "R. A. Alfieri", title = "An Efficient Kernel-Based Implementation of {POSIX} Threads", crossref = "Anonymous:1994:USC", pages = "59--72", year = "1994", bibdate = "Sat May 25 07:59:58 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @InProceedings{Anonymous:1994:DCT, author = "Anonymous", title = "On the Design of {Chant}: a Talking Threads Package", crossref = "IEEE:1994:PSW", pages = "350--359", year = "1994", bibdate = "Mon Aug 26 10:38:41 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Anonymous:1994:MDP, author = "Anonymous", title = "{Multiprocessor desktops are proliferating, even though there remains a shortage of multithreaded applications for them}", journal = j-OPEN-SYSTEMS-TODAY, volume = "165", pages = "60--??", month = dec, year = "1994", ISSN = "1061-0839", bibdate = "Fri Jan 26 17:24:01 MST 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Open Systems Today", } @Article{Anonymous:1994:SIP, author = "Anonymous", title = "Special issue: panel sessions of the {1991 Workshop on Multithreaded Computers, November 22, 1991, Albuquerque, New Mexico, in conjunction with Supercomputing '91}", journal = j-COMP-ARCH-NEWS, volume = "22", number = "1", pages = "2--33", year = "1994", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Anonymous:1994:WMC, author = "Anonymous", title = "{Wanted: The Multithreaded CIO}", journal = j-DATAMATION, volume = "40", number = "8", pages = "34--??", day = "15", month = apr, year = "1994", CODEN = "DTMNAT", ISSN = "0011-6963", bibdate = "Sat Jan 27 07:35:21 MST 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Technician or business manager? 
If you want to be a CIO, you better be both. Add to that a host of communications skills and an ability to travel in diverse circles, and you're on your way to being the Multithreaded CIO of the 1990s.", acknowledgement = ack-nhfb, fjournal = "Datamation", } @InProceedings{Baker:1994:EPP, author = "T. P. Baker and Frank Mueller and Viresh Rustagi", title = "Experience with a Prototype of the {POSIX} ``Minimal Realtime System Profile''", crossref = "IEEE:1994:ROS", pages = "12--17", year = "1994", bibdate = "Sat May 25 07:59:58 MDT 1996", bibsource = "Compendex database; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "This paper describes experience prototyping the proposed IEEE standard `minimal realtime system profile', whose primary component is support for real-time threads. It provides some background, describes the implementation, and reports preliminary performance measurements.", acknowledgement = ack-nhfb, affiliation = "Florida State Univ", affiliationaddress = "Tallahassee, FL, USA", classification = "722.4; 723.1; 723.1.1; 723.2", conference = "Proceedings of the 11th IEEE Workshop on Real-Time Operating Systems and Software", conferenceyear = "1994", journalabr = "Proc IEEE Workshop Real Time Oper Syst Software", keywords = "Computer operating systems; Computer software portability; Data structures; High level languages; Interfaces (computer); Mesa programming language; Minimal real time system profile; Program processors; Real time systems; Thread; Thread management; Thread priority scheduling", meetingaddress = "Seattle, WA, USA", meetingdate = "May 18--19 1994", meetingdate2 = "05/18--19/94", publisherinfo = "Computer Society Press", sponsor = "IEEE Computer Society", } @Article{Baquero:1994:CAC, author = "Carlos Baquero and Francisco Moura", title = "Concurrency Annotations in {C++}", journal = j-SIGPLAN, volume = "29", number = "7", pages = "61--67", month = jul, year = "1994", CODEN = "SINODQ", ISSN = "0362-1340 (print), 
1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:16:53 MST 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classification = "C6110J (Object-oriented programming); C6110P (Parallel programming); C6140D (High level languages)", corpsource = "DI/INESC, Minho Univ., Portugal", fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "access flag; C language; C++; concurrency annotations; inheritance; inheritance chain; language extension; method code; method invocations; method predicates; multiple threads; object-oriented languages; parallel languages; shared-memory multiprocessor system; synchronisation; synchronization code; synchronization mechanisms", treatment = "P Practical", } @InProceedings{Blumofe:1994:SMC, author = "R. D. Blumofe and C. E. Leiserson", title = "Scheduling multithreaded computations by work stealing", crossref = "Goldwasser:1994:PAS", pages = "356--368", year = "1994", bibdate = "Thu Apr 5 06:13:51 MDT 2001", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @InProceedings{Buendgen:1994:MAT, author = "R. Buendgen and M. Goebel and W. Kuechlin", title = "Multi-Threaded {AC} Term Rewriting", crossref = "Hong:1994:FIS", pages = "84--93", year = "1994", bibdate = "Thu Mar 12 11:28:58 MST 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/issac.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @InProceedings{Buendgen:1994:MTA, author = "R. Buendgen and M. Goebel and W. 
Kuechlin", title = "Multi-Threaded {AC} Term Rewriting", crossref = "Hong:1994:FIS", pages = "84--93", year = "1994", bibdate = "Thu Mar 12 11:28:58 MST 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/issac.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Buhr:1994:TRM, author = "R. J. A. Buhr and R. S. Casselman", title = "Timethread-Role Maps for Object-Oriented Design of Real-Time-and-Distributed Systems", journal = j-SIGPLAN, volume = "29", number = "10", pages = "301--301", month = oct, year = "1994", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Apr 24 18:36:02 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classification = "C6110J (Object-oriented programming); C6150N (Distributed systems)", conflocation = "Portland, OR, USA; 23-27 Oct. 1994", conftitle = "Ninth Annual Conference on Object-Oriented Programming Systems, Languages, and Applications. OOPSLA '94", corpsource = "Dept. of Syst. and Comput. 
Eng., Carleton Univ., Ottawa, Ont., Canada", fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "concurrency; distributed processing; distributed systems; dynamic structure; end-to-end responsibility paths; object-oriented approach; object-oriented design; object-oriented design methods; object-oriented methods; object-oriented programming; real-time systems; real-time systems oriented programming; responsibility-driven design; timethread-role maps", sponsororg = "ACM", treatment = "P Practical", } @InProceedings{Bundgen:1994:FPC, author = "Reinhard B{\"u}ndgen and Manfred G{\"o}bel and Wolfgang K{\"u}chlin", title = "A fine-grained parallel completion procedure", crossref = "ACM:1994:IPI", pages = "269--277", year = "1994", bibdate = "Thu Mar 12 08:41:19 MST 1998", bibsource = "http://www.acm.org/pubs/toc/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/proceedings/issac/190347/p269-bundgen/", abstract = "We present a parallel Knuth--Bendix completion algorithm where the inner loop, deriving the consequences of adding a new rule to the system, is multithreaded. The selection of the best new rule in the outer loop, and hence the completion strategy, is exactly the same as for the sequential algorithm. Our implementation, which is within the PARSAC-2 parallel symbolic computation system, exhibits good parallel speedups on a standard multiprocessor workstation.", acknowledgement = ack-nhfb, affiliation = "Wilhelm-Schickard-Inst. 
fur Inf., Tubingen Univ., Germany", classification = "C4210L (Formal languages and computational linguistics); C4240P (Parallel programming and algorithm theory); C6130 (Data handling techniques); C6150N (Distributed systems software); C7310 (Mathematics computing)", keywords = "algorithms; Fine grained parallel completion procedure; Fine-grained parallel completion procedure; Multithreaded inner loop; Parallel Knuth--Bendix completion algorithm; Parallel speedups; PARSAC-2 parallel symbolic computation system; Standard multiprocessor workstation", subject = "{\bf I.1.2} Computing Methodologies, SYMBOLIC AND ALGEBRAIC MANIPULATION, Algorithms, Algebraic algorithms. {\bf I.1.0} Computing Methodologies, SYMBOLIC AND ALGEBRAIC MANIPULATION, General. {\bf I.1.3} Computing Methodologies, SYMBOLIC AND ALGEBRAIC MANIPULATION, Languages and Systems. {\bf F.4.2} Theory of Computation, MATHEMATICAL LOGIC AND FORMAL LANGUAGES, Grammars and Other Rewriting Systems, Parallel rewriting systems. {\bf F.1.2} Theory of Computation, COMPUTATION BY ABSTRACT DEVICES, Modes of Computation, Parallelism and concurrency.", thesaurus = "Parallel algorithms; Parallel machines; Rewriting systems; Symbol manipulation", } @Article{Carter:1994:HSF, author = "Nicholas P. Carter and Stephen W. Keckler and William J. 
Dally", title = "Hardware support for fast capability-based addressing", journal = j-SIGPLAN, volume = "29", number = "11", pages = "319--327", month = nov, year = "1994", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:16:57 MST 2003", bibsource = "http://portal.acm.org/; http://www.acm.org/pubs/toc/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/proceedings/asplos/195473/p319-carter/", abstract = "Traditional methods of providing protection in memory systems do so at the cost of increased context switch time and/or increased storage to record access permissions for processes. With the advent of computers that supported cycle-by-cycle multithreading, protection schemes that increase the time to perform a context switch are unacceptable, but protecting unrelated processes from each other is still necessary if such machines are to be used in non-trusting environments. This paper examines {\em guarded pointers\/}, a hardware technique which uses tagged 64-bit pointer objects to implement capability-based addressing. Guarded pointers encode a segment descriptor into the upper bits of every pointer, eliminating the indirection and related performance penalties associated with traditional implementations of capabilities. All processes share a single 54-bit virtual address space, and access is limited to the data that can be referenced through the pointers that a process has been issued. Only one level of address translation is required to perform a memory reference. Sharing data between processes is efficient, and protection states are defined to allow fast protected subsystem calls and create unforgeable data keys.", acknowledgement = ack-nhfb, classification = "C5310 (Storage system design); C6120 (File organisation); C6150N (Distributed systems software)", conflocation = "San Jose, CA, USA; 4-7 Oct. 
1994", conftitle = "Sixth International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS-VI)", corpsource = "Lab. for Comput. Sci., MIT, Cambridge, MA, USA", fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "54- bit virtual address space; address translation; capability based addressing; cycle-by-cycle multithreading; design; fast capability-based addressing; fast protected subsystem calls; guarded pointers; hardware support; hardware technique; memory architecture; memory bit virtual address space; memory reference; memory systems; multiprocessing programs; performance; protection schemes; protection states; segment descriptor; storage allocation; tagged 64-bit pointer objects; theory; unforgeable data keys; virtual storage", sponsororg = "ACM; IEEE Comput. Soc", subject = "{\bf C.0} Computer Systems Organization, GENERAL, Instruction set design. {\bf C.4} Computer Systems Organization, PERFORMANCE OF SYSTEMS.", treatment = "P Practical", } @Book{Catanzaro:1994:MSA, author = "Ben J. Catanzaro", title = "Multiprocessor system architectures: a technical survey of multiprocessor\slash multithreaded systems using {SPARC}, multilevel bus architectures and {Solaris} {(SunOS)}", publisher = pub-PHPTR, address = pub-PHPTR:adr, pages = "xxxii + 493", year = "1994", ISBN = "0-13-089137-1", ISBN-13 = "978-0-13-089137-2", LCCN = "QA76.5.C3864 1994", bibdate = "Fri Aug 7 08:29:38 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "computer architecture; multiprocessors; sun computers", } @Article{Chase:1994:SPS, author = "Jeffrey S. Chase and Henry M. Levy and Michael J. Feeley and Edward D. 
Lazowska", title = "Sharing and Protection in a Single-Address-Space Operating System", journal = j-TOCS, volume = "12", number = "4", pages = "271--307", month = nov, year = "1994", CODEN = "ACSYEC", ISSN = "0734-2071 (print), 1557-7333 (electronic)", ISSN-L = "0734-2071", bibdate = "Wed Jan 13 18:36:53 MST 1999", bibsource = "http://www.acm.org/pubs/contents/journals/tocs/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/journals/tocs/1994-12-4/p271-chase/", abstract = "This article explores memory sharing and protection support in Opal, a single-address-space operating system designed for wide-address (64-bit) architectures. Opal threads execute within protection domains in a single shared virtual address space. Sharing is simplified, because addresses are context independent. There is no loss of protection, because addressability and access are independent; the right to access a segment is determined by the protection domain in which a thread executes. This model enables beneficial code-and data-sharing patterns that are currently prohibitive, due in part to the inherent restrictions of multiple address spaces, and in part to Unix programming style. We have designed and implemented an Opal prototype using the Mach 3.0 microkernel as a base. Our implementation demonstrates how a single-address-space structure can be supported alongside of other environments on a modern microkernel operating system, using modern wide-address architectures. 
This article justifies the Opal model and its goals for sharing and protection, presents the system and its abstractions, describes the prototype implementation, and reports experience with integrated applications.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Computer Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J774", keywords = "design; experimentation; measurement; performance", subject = "{\bf D.4.2} Software, OPERATING SYSTEMS, Storage Management. {\bf C.1.3} Computer Systems Organization, PROCESSOR ARCHITECTURES, Other Architecture Styles, Capability architectures**. {\bf D.3.3} Software, PROGRAMMING LANGUAGES, Language Constructs and Features, Modules, packages. {\bf D.4.4} Software, OPERATING SYSTEMS, Communications Management. {\bf D.4.6} Software, OPERATING SYSTEMS, Security and Protection, Access controls. {\bf D.4.6} Software, OPERATING SYSTEMS, Security and Protection, Information flow controls. {\bf D.4.7} Software, OPERATING SYSTEMS, Organization and Design. {\bf D.4.8} Software, OPERATING SYSTEMS, Performance, Measurements. {\bf E.1} Data, DATA STRUCTURES. {\bf E.2} Data, DATA STORAGE REPRESENTATIONS.", } @Article{Chaudhry:1994:CMP, author = "Ghulam Chaudhry and Xuechang Li", title = "A case for the multithreaded processor architecture", journal = j-COMP-ARCH-NEWS, volume = "22", number = "4", pages = "55--59", month = sep, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:12 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Dennis:1994:MMP, author = "Jack B. 
Dennis", title = "Machines and Models for Parallel Computing", journal = j-INT-J-PARALLEL-PROG, volume = "22", number = "1", pages = "47--77", month = feb, year = "1994", CODEN = "IJPPE5", ISSN = "0885-7458 (print), 1573-7640 (electronic)", ISSN-L = "0885-7458", bibdate = "Sat Apr 26 11:04:14 MDT 1997", bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=22&issue=1; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classification = "C5220P (Parallel architecture); C5440 (Multiprocessor systems and techniques); C6110 (Systems analysis and programming); C6150N (Distributed systems)", corpsource = "Lab. for Comput. Sci., MIT, Cambridge, MA, USA", fjournal = "International Journal of Parallel Programming", journal-URL = "http://link.springer.com/journal/10766", keywords = "concurrency control; dataflow principles; functional programming; general semantic model; memory latency; microprocessors; modular software construction; multithreading; parallel computation; parallel computing models; parallel machines; parallel programming; processor architecture; processor design; RISC; shared memory systems; shared-memory model; superpipelined; superscalar; synchronization", treatment = "P Practical", } @Book{Dorfman:1994:EMO, author = "Len Dorfman and Marc J. 
Neuberger", title = "Effective multithreading in {OS/2}", publisher = pub-MCGRAW-HILL, address = pub-MCGRAW-HILL:adr, pages = "xii + 288", year = "1994", ISBN = "0-07-017841-0 (paperback)", ISBN-13 = "978-0-07-017841-0 (paperback)", LCCN = "QA76.76.O63D6694 1994", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", price = "US\$34.95", acknowledgement = ack-nhfb, annote = "System requirements for computer disk: IBM-compatible PC; 4MB RAM (8MB recommended); OS/2; C compiler such as IBM CSet++ or Borland C++ for OS/2; high-density floppy disk drive; hard disk with 3.1MB free space.", keywords = "Microcomputers -- Operating systems; Operating systems (Computers); OS/2 (Computer file)", } @TechReport{Dubey:1994:APM, author = "Pradeep Dubey and Arvind Krishna and M. J. (Michael J.) Flynn", title = "Analytical performance modeling for a spectrum of multithreaded machines", type = "Research report", number = "RC 19549 (85007)", institution = "IBM T. J. Watson Research Center", address = "Yorktown Heights, NY, USA", pages = "27", day = "3", month = may, year = "1994", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The throughput of pipelined processors suffers due to delays associated with instruction dependencies and memory latencies. Multithreaded architectures try to tolerate such delays by sharing the pipeline with independent instruction threads. This paper proposes a comprehensive analytical framework to quantitate the performance potential of a wide spectrum of multithreaded machines ranging from those that are capable of switching threads every cycle to those that switch threads only on long inter-instruction latencies. 
For machines in the former category, the proposed analytic model provides an exact solution for pipeline utilization which is significantly better than lower and upper bounds obtainable from simple approximation techniques. Unlike previously published analytic models of such systems, the Markov model developed here accepts a general distribution for the interlock delays with multiple latencies. For machines in the latter category, the paper provides an approximate analytic model which is simpler than previously published analytic models. The models have been verified using previously published analytical and simulation-based results. As compared to the simulation alternative, the models provide a much quicker estimate of pipeline utilization as a function of a number of threads.", acknowledgement = ack-nhfb, keywords = "Computer architecture", } @MastersThesis{Gallagher:1994:PLM, author = "William Lynn Gallagher", title = "Performance limitations of the {MTS} multithreaded architecture", type = "Thesis ({M.S. in Engineering})", school = "University of Texas at Austin", address = "Austin, TX, USA", pages = "xiv + 101", year = "1994", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Gerlhof:1994:MTA, author = "C. A. Gerlhof and A. 
Kemper", title = "A Multi-Threaded Architecture for Prefetching in Object Bases", journal = j-LECT-NOTES-COMP-SCI, volume = "779", pages = "351--364", year = "1994", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Wed Sep 15 18:44:20 MDT 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/lncs1994.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", keywords = "database technology; EDBT; extending database technology", } @Article{Gibson:1994:CMC, author = "Ken Gibson", title = "A {C++} Multitasking Class Library", journal = j-DDJ, volume = "19", number = "5", pages = "28, 30, 32, 34, 96--98", month = may, year = "1994", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Tue Sep 03 09:15:49 1996", bibsource = "http://www.ddj.com/index/author/index.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database", abstract = "Multithreaded applications that concurrently execute more than one section of code aren't directly supported by languages such as C++. Ken presents a C++ multitasking class library for MS-DOS that lets you implement a program as a set of concurrent threads.", acknowledgement = ack-nhfb, classification = "C6110J (Object-oriented programming); C6110P (Parallel programming)", fjournal = "Dr. Dobb's Journal of Software Tools", keywords = "C++ multitasking class library; Concurrent execution; DOS; Embedded processors; Interthread communications; Locator program; Microsoft C++ 7.0; Multithreaded applications; Portability; Processor initialization; Queue class; Real-time device control; Real-time executive; ROMable image; Scheduler object; Semaphore class; Simulation; Thread class; Thread synchronization", thesaurus = "C listings; Multiprogramming; Object-oriented programming; Public domain software; Scheduling; Subroutines", } @Article{Giloi:1994:PSA, author = "Wolfgang K. 
Giloi", title = "Parallel supercomputer architectures and their programming models", journal = j-PARALLEL-COMPUTING, volume = "20", number = "10--11", pages = "1443--1470", day = "3", month = nov, year = "1994", CODEN = "PACOEJ", ISSN = "0167-8191 (print), 1872-7336 (electronic)", ISSN-L = "0167-8191", bibdate = "Fri Aug 6 10:13:51 MDT 1999", bibsource = "http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_free/browse/browse.cgi?year=1994&volume=20&issue=10-11; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_sub/browse/browse.cgi?year=1994&volume=20&issue=10-11&aid=907", acknowledgement = ack-nhfb, classification = "C5220P (Parallel architecture); C5440 (Multiprocessing systems); C5470 (Performance evaluation and testing)", corpsource = "FIRST, GMD Res. Inst. for Comput. Arch. and Software Eng., Berlin, Germany", fjournal = "Parallel Computing", journal-URL = "http://www.sciencedirect.com/science/journal/01678191", keywords = "*T; abstract machine; architectures; DASH; distributed memory; distributed memory systems; distributed shared; hardware architecture; latency hiding; latency minimization; MANNA; memory architectures; message passing; message passing architectures; multi-threaded architectures; parallel; parallel supercomputer architectures; performance; performance evaluation; physically shared memory systems; programming models; scalability; shared memory architectures; shared memory systems; systems; taxonomy; virtual", treatment = "P Practical", } @Manual{Haines:1994:DCT, author = "Matthew Haines and David Cronk and Piyush Mehrotra", title = "On the design of {Chant}: a talking threads package: final report", number = "194903", publisher = pub-NTIS, address = pub-NTIS:adr, pages = "??", year = "1994", LCCN = "NAS 1.26:194903 Govt Pubs", bibdate = "Fri May 10 12:18:17 MDT 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "Shipping list 
number 94-0861-M.", series = "NASA contractor report", acknowledgement = ack-nhfb, keywords = "message processing; messages", } @Article{Halstead:1994:PCR, author = "Burt Halstead and David Callahan and Jack Dennis and R. S. Nikhil and Vivek Sarkar", title = "Programming, compilation, and resource management issues for multithreading (panel session {II})", journal = j-COMP-ARCH-NEWS, volume = "22", number = "1", pages = "19--33", month = mar, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:34 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @InProceedings{Holm:1994:CSP, author = "J. Holm and A. Lain and P. Banerjee", title = "Compilation of Scientific Programs into Multithreaded and Message Driven Computation", crossref = "IEEE:1994:PSH", pages = "518--525", year = "1994", bibdate = "Mon Aug 26 10:38:41 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Iannucci:1994:AII, author = "Robert Iannucci and Anant Agarwal and Bill Dally and Anoop Gupta and Greg Papadopoulos and Burton Smith", title = "Architectural and implementation issues for multithreading (panel session {I})", journal = j-COMP-ARCH-NEWS, volume = "22", number = "1", pages = "3--18", month = mar, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:34 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Book{Iannucci:1994:MCA, editor = "Robert A. 
Iannucci and others", title = "Multithreaded computer architecture: a summary of the state of the art", volume = "SECS 0281", publisher = pub-KLUWER, address = pub-KLUWER:adr, pages = "xvi + 400", year = "1994", ISBN = "0-7923-9477-1", ISBN-13 = "978-0-7923-9477-8", LCCN = "QA76.9.A73 M85 1994", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", series = "The Kluwer international series in engineering and computer science", acknowledgement = ack-nhfb, keywords = "computer architecture; Computer architecture; Computers -- Design", } @InProceedings{Jeffay:1994:LMT, author = "K. Jeffay", title = "On latency management in time-shared operating systems", crossref = "IEEE:1994:PIW", pages = "86--90", year = "1994", bibdate = "Sat Sep 28 18:52:45 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/mach.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, affiliation = "Dept. of Comput. Sci., North Carolina Univ., Chapel Hill, NC, USA", classification = "C6150J (Operating systems); C6150N (Distributed systems)", keywords = "End-to-end latency; Inter-process communication interconnections; Latency management; Multi-threaded applications; Real-Time Mach kernel; Time-shared operating systems; YARTOS kernel", thesaurus = "Message passing; Operating systems [computers]; Real-time systems; Scheduling; Time-sharing programs", } @Article{Kanalakis:1994:ET, author = "John M. 
{Kanalakis, Jr.}", title = "Examining {OS/2} 2.1 threads", journal = j-DDJ, volume = "19", number = "1", pages = "74, 76, 78--79, 96", month = jan, year = "1994", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Tue Sep 10 08:52:50 MDT 1996", bibsource = "http://www.ddj.com/index/author/index.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database", abstract = "The OS/2 2.1 multitasking model is based on the execution of threads, making it possible for many sections of a single process to execute simultaneously. John examines OS/2's thread architecture, specifically, the scheduling process.", acknowledgement = ack-nhfb, classification = "C6150J (Operating systems)", fjournal = "Dr. Dobb's Journal of Software Tools", keywords = "Bias implementation; OS/2 2.1 multitasking model; Round robin scheduling; Scheduling process; Thread architecture; Threads", thesaurus = "Multiprogramming; Operating systems [computers]; Scheduling", } @Article{Kelly:1994:MBC, author = "Michael Kelly", title = "Multithreading with {OS/2} and {Borland C++}", journal = j-CCCUJ, volume = "12", number = "8", pages = "67--??", month = aug, year = "1994", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Fri Aug 30 16:52:23 MDT 1996", bibsource = "http://www.cuj.com/cbklist.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{Kelly:1994:MOB, author = "Michael Kelly", title = "Multithreading with {OS/2} and {Borland C++}", journal = j-CCCUJ, volume = "12", number = "8", pages = "67--??", month = aug, year = "1994", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Fri Aug 30 16:52:23 MDT 1996", bibsource = "http://www.cuj.com/cbklist.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @PhdThesis{Kim:1994:FPF, author = "Chinhyun Kim", title = "Functional programming and fine-grain multithreading for high-performance 
parallel computing", type = "Thesis ({Ph.D.})", school = "University of Southern California", address = "Los Angeles, CA, USA", pages = "xv + 150", year = "1994", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Kim:1994:HAM, author = "C. Kim and J.-L. Gaudiot", title = "A Hierarchical Activation Management Technique for Fine-Grain Multithreaded Execution", journal = j-LECT-NOTES-COMP-SCI, volume = "817", pages = "577--??", year = "1994", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Mon May 13 11:52:14 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Krieger:1994:ASF, author = "Orran Krieger and Michael Stumm and Ron Unrau", title = "The {Alloc Stream Facility}: a Redesign of Application-Level Stream {I/O}", journal = j-COMPUTER, volume = "27", number = "3", pages = "75--82", month = mar, year = "1994", CODEN = "CPTRB4", ISSN = "0018-9162 (print), 1558-0814 (electronic)", ISSN-L = "0018-9162", bibdate = "Mon Feb 3 07:28:57 MST 1997", bibsource = "Compendex database; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Many stdio and even Unix I/O applications run faster when linked to the ASF application-level library. Using the Alloc Stream Interface improves performance even more.", acknowledgement = ack-nhfb, affiliation = "Dept. of Electr. and Comput. 
Eng., Toronto Univ., Ont., Canada", affiliationaddress = "Toronto, Can", classification = "723; C6110J (Object-oriented programming); C6110P (Parallel programming); C6150J (Operating systems)", fjournal = "Computer", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2", journalabr = "Computer", keywords = "Alloc Stream Facility; Alloc stream interface; Application-level I/O facility; Application-level library; Application-level stream I/O; ASF; C stdio library; C++ stream I/O; Computer operating systems; Concurrency; I/O-intensive applications; Input output programs; Mapped files; Multithreaded applications; Object-oriented structure; Parallel applications; Parallel systems; Performance improvements; Popular I/O interfaces; Sequential byte stream; Standard Unix systems; Stdio; System behavior; UNIX", thesaurus = "Input-output programs; Object-oriented methods; Parallel programming; Unix", } @Article{Laudon:1994:IMT, author = "James Laudon and Anoop Gupta and Mark Horowitz", title = "Interleaving: a multithreading technique targeting multiprocessors and workstations", journal = j-SIGPLAN, volume = "29", number = "11", pages = "308--318", month = nov, year = "1994", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:16:57 MST 2003", bibsource = "http://portal.acm.org/; http://www.acm.org/pubs/toc/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "Co-published in {\em Operating Systems Review}, {\bf 28}(5).", URL = "http://www.acm.org:80/pubs/citations/proceedings/asplos/195473/p308-laudon/", abstract = "There is an increasing trend to use commodity microprocessors as the compute engines in large-scale multiprocessors. 
However, given that the majority of the microprocessors are sold in the workstation market, not in the multiprocessor market, it is only natural that architectural features that benefit only multiprocessors are less likely to be adopted in commodity microprocessors. In this paper, we explore multiple-context processors, an architectural technique proposed to hide the large memory latency in multiprocessors. We show that while current multiple-context designs work reasonably well for multiprocessors, they are ineffective in hiding the much shorter uniprocessor latencies using the limited parallelism found in workstation environments. We propose an alternative design that combines the best features of two existing approaches, and present simulation results that show it yields better performance for both multiprogrammed workloads on a workstation and parallel applications on a multiprocessor. By addressing the needs of the workstation environment, our proposal makes multiple contexts more attractive for commodity microprocessors.", acknowledgement = ack-nhfb, classification = "C5430 (Microcomputers); C5440 (Multiprocessing systems); C6120 (File organisation); C6150J (Operating systems)", conflocation = "San Jose, CA, USA; 4-7 Oct. 1994", conftitle = "Sixth International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS-VI)", corpsource = "Comput. Syst. 
Lab., Stanford Univ., CA, USA", fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "architectural features; commodity microprocessors; compute engines; design; interleaved storage; interleaving; large memory latency; large-scale multiprocessors; measurement; multiple-context designs; multiple-context processors; multiprocessing systems; multiprogrammed workloads; multiprogramming; multithreading technique; parallel applications; parallel uniprocessor latencies; performance; theory; uniprocessor latencies; workstations", sponsororg = "ACM; IEEE Comput. Soc", subject = "{\bf C.5.3} Computer Systems Organization, COMPUTER SYSTEM IMPLEMENTATION, Microcomputers. {\bf C.4} Computer Systems Organization, PERFORMANCE OF SYSTEMS.", treatment = "P Practical", } @Article{Launchbury:1994:LFS, author = "John Launchbury and Simon L. {Peyton Jones}", title = "Lazy Functional State Threads", journal = j-SIGPLAN, volume = "29", number = "6", pages = "24--35", month = jun, year = "1994", CODEN = "SINODQ", ISBN = "0-89791-598-4", ISBN-13 = "978-0-89791-598-4", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:16:51 MST 2003", bibsource = "http://www.acm.org/pubs/contents/proceedings/pldi/178243/index.html; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/proceedings/pldi/178243/p24-launchbury/", abstract = "Some algorithms make critical internal use of updatable state, even though their external specification is purely functional. Based on earlier work on monads, we present a way of securely encapsulating stateful computations that manipulate multiple, named, mutable objects, in the context of a non-strict, purely-functional language. The security of the encapsulation is assured by the type system, using parametricity. 
Intriguingly, this parametricity requires the provision of a (single) constant with a rank-2 polymorphic type.", acknowledgement = ack-nhfb, annote = "Published as part of the Proceedings of PLDI'94.", classification = "C4240 (Programming and algorithm theory); C6110 (Systems analysis and programming); C6140D (High level languages)", conflocation = "Orlando, FL, USA; 20-24 June 1994", conftitle = "ACM SIGPLAN '94 Conference on Programming Language Design and Implementation (PLDI)", corpsource = "Glasgow Univ., UK", fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "algorithms; encapsulation; external specification; functional language; functional programming; high level languages; languages; lazy functional state threads; monads; mutable objects; nonstrict purely-functional language; parametricity; rank-2 polymorphic type; security; specification; stateful computations; type system; type theory; updatable state", sponsororg = "ACM", subject = "{\bf D.3.3} Software, PROGRAMMING LANGUAGES, Language Constructs and Features, Procedures, functions, and subroutines. {\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language Classifications, Applicative (functional) languages. {\bf F.3.3} Theory of Computation, LOGICS AND MEANINGS OF PROGRAMS, Studies of Program Constructs, Type structure. {\bf F.4.1} Theory of Computation, MATHEMATICAL LOGIC AND FORMAL LANGUAGES, Mathematical Logic, Lambda calculus and related systems.", treatment = "P Practical; T Theoretical or Mathematical", } @Article{Lee:1994:DAM, author = "Ben Lee and A. R. 
Hurson", title = "Dataflow Architectures and Multithreading", journal = j-COMPUTER, volume = "27", number = "8", pages = "27--39", month = aug, year = "1994", CODEN = "CPTRB4", ISSN = "0018-9162 (print), 1558-0814 (electronic)", ISSN-L = "0018-9162", bibdate = "Mon Feb 3 07:28:57 MST 1997", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Contrary to initial expectations, implementing dataflow computers has presented a monumental challenge. Now, however, multithreading offers a viable alternative for building hybrid architectures that exploit parallelism.", acknowledgement = ack-nhfb, affiliation = "Dept. of Electr. and Comput. Eng., Oregon State Univ., Corvallis, OR, USA", classification = "C5220P (Parallel architecture); C5440 (Multiprocessing systems)", fjournal = "Computer", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2", keywords = "Compilers; Concurrency; Data dependencies; Dataflow architectures; Dataflow machines; Functional semantics; Hybrid architectures; Id; Imperative languages; Multithreading; Parallel functional languages; Parallel machines; Parallelism; Programmability; Semantics; Side effects; SISAL; Source code; Streams and Iterations in a Single Assignment Language; Syntax; Threaded Abstract Machine", thesaurus = "Parallel architectures; Parallel processing", } @Article{Liedtke:1994:SNIb, author = "Jochen Liedtke", title = "A short note on implementing thread exclusiveness and address space locking", journal = j-OPER-SYS-REV, volume = "28", number = "3", pages = "38--42", month = jul, year = "1994", CODEN = "OSRED8", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:46 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @MastersThesis{Lu:1994:MPM, author = "David Ta-Chang Lu", title = "A multithreaded processor for massively parallel
architectures", type = "Thesis ({M.S.})", school = "University of California, Riverside", address = "Riverside, CA, USA", pages = "vii + 42", year = "1994", LCCN = "QA76.58 .L88 1994", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "computer algorithms; Computer algorithms; computer architecture; Computer architecture; dissertations; dissertations, academic -- UCR -- computer science; parallel computers; Parallel computers; Parallel processing (Electronic computers); parallel processing (electronic computers); Science -- Dissertations; University of California, Riverside. -- Dept. of Computer; University of California, Riverside. Dept. of Computer Science", } @Article{Marinescu:1994:HLC, author = "Dan C. Marinescu and John R. Rice", title = "On High Level Characterization of Parallelism", journal = j-J-PAR-DIST-COMP, volume = "20", number = "1", pages = "107--113", month = jan, year = "1994", CODEN = "JPDCER", DOI = "https://doi.org/10.1006/jpdc.1994.1011", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Thu Mar 9 09:18:53 MST 2000", bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1994.1011/production; http://www.idealibrary.com/links/doi/10.1006/jpdc.1994.1011/production/pdf", acknowledgement = ack-nhfb, classification = "C4240P (Parallel programming and algorithm theory); C5220P (Parallel architecture); C5470 (Performance evaluation and testing)", corpsource = "Dept. of Comput. 
Sci., Purdue Univ., West Lafayette, IN, USA", fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", keywords = "communication complexity; load balancing; massively parallel; parallel architectures; parallel execution; parallelism; performance analysis; performance evaluation; speedup; systems; threads of control", treatment = "T Theoretical or Mathematical", } @Book{MixSoftware:1994:UMC, author = "{Mix Software, Inc}", title = "Using {Multi-C}: a portable multithreaded {C} programming library", publisher = pub-PHPTR, address = pub-PHPTR:adr, pages = "vi + 257", year = "1994", ISBN = "0-13-606195-8", ISBN-13 = "978-0-13-606195-3", LCCN = "QA76.73.C15 U85 1994", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "System requirements for computer disk: IBM-compatible PC; DOS; Mix, Borland, or Microsoft-compatible C/C++ compilers.", acknowledgement = ack-nhfb, annote = "System requirements for computer disk: IBM-compatible PC; DOS; Mix, Borland, or Microsoft-compatible C/C++ compilers.", keywords = "C (computer program language); C (Computer program language); Microcomputers -- Programming languages", } @Article{Mukherjee:1994:MII, author = "Bodhisattwa Mukherjee and Greg Eisenhauer and Kaushik Ghosh", title = "A machine independent interface for lightweight threads", journal = j-OPER-SYS-REV, volume = "28", number = "1", pages = "33--47", month = jan, year = "1994", CODEN = "OSRED8", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:36 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @Article{Nemawarkar:1994:PIN, author = "S. S. Nemawarkar and R. Govindarajan and G. R. Gao and V. K. 
Agarwal", title = "Performance of Interconnection Network in Multithreaded Architectures", journal = j-LECT-NOTES-COMP-SCI, volume = "817", pages = "823--??", year = "1994", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Mon May 13 11:52:14 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Nikhil:1994:MII, author = "Rishiyur S. Nikhil", title = "A Multithreaded Implementation of {Id} using {P-RISC} Graphs", journal = j-LECT-NOTES-COMP-SCI, volume = "768", pages = "390--??", year = "1994", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Mon May 13 11:52:14 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Norwood:1994:SMP, author = "John Norwood and Shankar Vaidyanathan", title = "Symmetric Multiprocessing for {PCs}", journal = j-DDJ, volume = "19", number = "1", pages = "80, 82--85, 98--99", month = jan, year = "1994", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Tue Sep 03 09:15:46 1996", bibsource = "http://www.ddj.com/index/author/index.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database", abstract = "Our authors focus on multithreaded application development for single-processor and symmetric-multiprocessor machines under Windows NT. In doing so, they present Fortran interface statements for the Win32 console API and a black-box solution for calling 32-bit DLLs from 16-bit applications under NT.", acknowledgement = ack-nhfb, classification = "C6150J (Operating systems); C6150N (Distributed systems)", fjournal = "Dr. 
Dobb's Journal of Software Tools", keywords = "16-Bit applications; 32-Bit DLLs; Black-box solution; Fortran interface statements; Multithreaded application; Single processor machines; Symmetric-multiprocessor machines; Win32 console API; Windows NT", thesaurus = "C listings; Multiprocessing programs; Multiprogramming", } @InProceedings{Ramsey:1994:CTB, author = "Norman Ramsey", title = "Correctness of trap-based breakpoint implementations", crossref = "ACM:1994:CRP", pages = "15--24", year = "1994", bibdate = "Mon May 3 12:50:22 MDT 1999", bibsource = "http://www.acm.org/pubs/toc/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/proceedings/plan/174675/p15-ramsey/", abstract = "It is common for debuggers to implement breakpoints by a combination of planting traps and single stepping. When the target program contains multiple threads of execution, a debugger that is not carefully implemented may miss breakpoints. This paper gives a formal model of a breakpoint in a two-threaded program. The model describes correct and incorrect breakpoint implementations. Automatic search of the model's state space shows that the correct implementation does miss a breakpoint. The results apply even to debuggers like dbx and gdb, which are apparently for single-threaded programs; when the user evaluates an expression containing function calls, the debugger executes the call in the target address space, in effect creating a new thread.", acknowledgement = ack-nhfb, keywords = "languages; measurement; theory", subject = "{\bf D.2.5} Software, SOFTWARE ENGINEERING, Testing and Debugging. 
{\bf F.3.1} Theory of Computation, LOGICS AND MEANINGS OF PROGRAMS, Specifying and Verifying and Reasoning about Programs.", } @Article{Rodley:1994:UIC, author = "John Rodley", title = "{OS/2} and {UnixWare} Interprocess Communication", journal = j-DDJ, volume = "19", number = "5", pages = "78--82, 84, 107--109", month = may, year = "1994", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Tue Sep 03 09:15:49 1996", bibsource = "http://www.ddj.com/index/author/index.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database", abstract = "Interprocess communication isn't portable between IBM's OS/2 2.1 and Novell's UnixWare 1.1. But even though the implementation details differ greatly, the two systems do share ways of thinking about IPC. John looks at IPC under OS/2 and UnixWare to see what common ground exists.", acknowledgement = ack-nhfb, classification = "C6150J (Operating systems); C6150N (Distributed systems)", fjournal = "Dr. Dobb's Journal of Software Tools", keywords = "APIs; Applications programming; Functionality; IBM OS/2 2.1; Implementation details; Independent processes; Interprocess communication; IPC models; Multitasking operating systems; Novell UnixWare 1.1; Threads", thesaurus = "C listings; Multiprocessing systems; Operating systems [computers]; Unix", } @InProceedings{Shee:1994:DMA, author = "Jang Chung Shee and Chao Chin Wu and Lin Wen You and Cheng Chen", title = "Design of a multithread architecture and its parallel simulation and evaluation environment", crossref = "Anonymous:1994:ICS", pages = "69--76 (vol. 1)", year = "1994", bibdate = "Sun Dec 22 10:19:23 MST 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, affiliation = "Inst. of Comput. Sci. and Inf. Eng., Nat.
Chiao Tung Univ., Hsinchu, Taiwan", classification = "C5220P (Parallel architecture); C6115 (Programming support); C6185 (Simulation techniques)", keywords = "Context switch; Integrated multiprocessing simulation environment; Multithread architecture; Parallel simulation; Parallel simulation and evaluation environment; Parallel Virtual Machine; SUN SPARC workstations; Thread-related instructions", thesaurus = "Digital simulation; Parallel architectures; Programming environments", } @InProceedings{Spero:1994:MMD, author = "Simon E. Spero", title = "{MDMA} --- Multithreaded Daemon for Multimedia Access", crossref = "Anonymous:1994:PIW", pages = "??--??", year = "1994", bibdate = "Mon Oct 23 09:15:37 2000", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @TechReport{Squillante:1994:AMP, author = "Mark S. Squillante", title = "Analytic modeling of processor utilization in multithreaded processor architectures", type = "Research report", number = "RC 19543 (84999)", institution = "IBM T. J. Watson Research Center", address = "Yorktown Heights, NY, USA", pages = "9", month = apr, year = "1994", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "In this paper, we develop an analytic model of processor utilization in multithreaded processor architectures that supports both serial and parallel processing of memory requests. The system is modeled as a finite, continuous-time Markov chain whose solution can be obtained efficiently. Although it applies more generally, our modeling approach supports an important class of probability distributions that can be used to approximate the distributions of interest with sufficient accuracy in most practical cases. 
This results in an efficient and accurate model across a wide variety of system environments.", acknowledgement = ack-nhfb, keywords = "Multiprocessors", } @Article{Tetewsky:1994:GDR, author = "Avram K. Tetewsky", title = "{GUI} Development for Real-Time Applications", journal = j-DDJ, volume = "19", number = "6", pages = "28, 30, 32, 36, 38, 40--41", month = jun, year = "1994", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Tue Sep 03 09:15:49 1996", bibsource = "http://www.ddj.com/index/author/index.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database", abstract = "Although they take radically different approaches, both ControlCalc and LabView are designed for building GUI-based, real-time control applications.", acknowledgement = ack-nhfb, affiliation = "Draper (C.S.) Lab., Cambridge, MA, USA", classification = "C6115 (Programming support); C6130B (Graphics techniques); C6180G (Graphical user interfaces); C7420 (Control engineering)", fjournal = "Dr. Dobb's Journal of Software Tools", keywords = "386/OS-9000; 680X0/OS9; ControlCalc Version 1.78; G-Windows 2.3 windowing package; GUI development; LabView 3.0; Multipage-spreadsheet paradigm; Multithreaded program; National Instruments; OS-9000 1.3; PC-based tools; Rapid prototyping; Real-time control application; RTWare; Windows data-flow driven software", thesaurus = "Computerised control; Graphical user interfaces; Real-time systems; Software tools", } @Article{Thekkath:1994:EMH, author = "Radhika Thekkath and Susan J. 
Eggers", title = "The effectiveness of multiple hardware contexts", journal = j-SIGPLAN, volume = "29", number = "11", pages = "328--337", month = nov, year = "1994", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:16:57 MST 2003", bibsource = "http://portal.acm.org/; http://www.acm.org/pubs/toc/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/proceedings/asplos/195473/p328-thekkath/", abstract = "Multithreaded processors are used to tolerate long memory latencies. By executing threads loaded in multiple hardware contexts, an otherwise idle processor can keep busy, thus increasing its utilization. However, the larger size of a multi-thread working set can have a negative effect on cache conflict misses. In this paper we evaluate the two phenomena together, examining their combined effect on execution time. The usefulness of multiple hardware contexts depends on: program data locality, cache organization and degree of multiprocessing. Multiple hardware contexts are most effective on programs that have been optimized for data locality. For these programs, execution time dropped with increasing contexts, over widely varying architectures. With unoptimized applications, multiple contexts had limited value. The best performance was seen with only two contexts, and only on uniprocessors and small multiprocessors. The behavior of the unoptimized applications changed more noticeably with variations in cache associativity and cache hierarchy, unlike the optimized programs. As a mechanism for exploiting program parallelism, an additional processor is clearly better than another context. 
However, there were many configurations for which the addition of a few hardware contexts brought as much or greater performance than a larger multiprocessor with fewer than the optimal number of contexts.", acknowledgement = ack-nhfb, classification = "C5320G (Semiconductor storage); C5440 (Multiprocessing systems); C6110P (Parallel programming); C6120 (File organisation); C6150N (Distributed systems software)", conflocation = "San Jose, CA, USA; 4-7 Oct. 1994", conftitle = "Sixth International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS-VI)", corpsource = "Dept. of Comput. Sci. and Eng., Washington Univ., Seattle, WA, USA", fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "cache associativity; cache conflict misses; cache hierarchy; cache organization; cache storage; data locality; design; long; long memory latencies; measurement; multi-thread working set; multiple hardware contexts; multiprocessing; multiprocessing systems; multithreaded processors; parallel programming; performance; program data locality; program parallelism; storage management; theory; unoptimized applications", sponsororg = "ACM; IEEE Comput. Soc", subject = "{\bf C.5.3} Computer Systems Organization, COMPUTER SYSTEM IMPLEMENTATION, Microcomputers. {\bf C.4} Computer Systems Organization, PERFORMANCE OF SYSTEMS.", treatment = "P Practical", } @Article{Thekkath:1994:ISB, author = "R. Thekkath and S. J. 
Eggers", title = "Impact of sharing-based thread placement on multithreaded architectures", journal = j-COMP-ARCH-NEWS, volume = "22", number = "2", pages = "176--186", month = apr, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @MastersThesis{Wang:1994:MAD, author = "Xiaobao Wang", title = "Multithreaded architecture: design and performance analysis", volume = "3016", type = "Thesis ({M. S.})", school = "Department of Electrical Engineering, University of Hawaii at Manoa", address = "Manoa, HI, USA", pages = "59", year = "1994", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", series = "Theses for the degree of Master of Science (University of Hawaii at Manoa)", acknowledgement = ack-nhfb, keywords = "Computer architecture; Multiprocessors", } @Article{Williams:1994:NST, author = "Al Williams", title = "{NT-Style} Threads For {MS-DOS}", journal = j-DDJ, volume = "19", number = "2", pages = "74, 76--77", month = feb, year = "1994", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Tue Sep 03 09:15:47 1996", bibsource = "http://www.ddj.com/index/author/index.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database", abstract = "Al uses Phar Lap's TNT 386/DOS-Extender to implement NT-style threads in a DOS program that removes a directory tree. Instead of recursing down the tree, the program (which works with NT and TNT) processes directories in parallel.", acknowledgement = ack-nhfb, classification = "C6110 (Systems analysis and programming); C6150C (Compilers, interpreters and other processors); C6150J (Operating systems)", fjournal = "Dr. 
Dobb's Journal of Software Tools", keywords = "BIOS interrupts; C library functions; Compiling; DOS; Memory allocation; MS-DOS; Multiple threads; Multithreading; Phar Lap; Specification; TNT 386/DOS-Extender; Win32 programming API; Win32-base API; Windows; Windows NT", thesaurus = "Interrupts; Multiprogramming; Operating systems [computers]; Program compilers", } @Article{Williams:1994:NTM, author = "Al Williams", title = "{NT-Style} Threads For {MS-DOS}", journal = j-DDJ, volume = "19", number = "2", pages = "74, 76--77", month = feb, year = "1994", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Tue Sep 03 09:15:47 1996", bibsource = "http://www.ddj.com/index/author/index.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database", abstract = "Al uses Phar Lap's TNT 386/DOS-Extender to implement NT-style threads in a DOS program that removes a directory tree. Instead of recursing down the tree, the program (which works with NT and TNT) processes directories in parallel.", acknowledgement = ack-nhfb, classification = "C6110 (Systems analysis and programming); C6150C (Compilers, interpreters and other processors); C6150J (Operating systems)", fjournal = "Dr. Dobb's Journal of Software Tools", keywords = "BIOS interrupts; C library functions; Compiling; DOS; Memory allocation; MS-DOS; Multiple threads; Multithreading; Phar Lap; Specification; TNT 386/DOS-Extender; Win32 programming API; Win32-base API; Windows; Windows NT", thesaurus = "Interrupts; Multiprogramming; Operating systems [computers]; Program compilers", } @Article{Wong:1994:SSI, author = "W. F. Wong and E. 
Goto", title = "A Simulation Study on the Interactions Between Multithreaded Architectures and the Cache", journal = j-INT-J-HIGH-SPEED-COMPUTING, volume = "6", number = "2", pages = "343--??", year = "1994", CODEN = "IHSCEZ", ISSN = "0129-0533", bibdate = "Mon Feb 25 11:19:24 MST 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; OCLC Article1st database", acknowledgement = ack-nhfb, fjournal = "International Journal of High Speed Computing (IJHSC)", } @Article{Anonymous:1995:HUW, author = "Anonymous", title = "{HP-UX 10.0 will be unveiled this week, with newly tuned kernel and I\slash {O} paths, plus a multithreaded NFS implementation}", journal = j-OPEN-SYSTEMS-TODAY, volume = "168", pages = "34--??", month = feb, year = "1995", ISSN = "1061-0839", bibdate = "Fri Jan 26 17:24:01 MST 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Open Systems Today", } @Article{Anonymous:1995:HWB, author = "Anonymous", title = "{HP-UX 10.0 will be unveiled this week, with newly tuned kernel and I\slash {O} paths, plus a multithreaded NFS implementation}", journal = j-OPEN-SYSTEMS-TODAY, volume = "168", pages = "34--??", month = feb, year = "1995", ISSN = "1061-0839", bibdate = "Fri Jan 26 17:24:01 MST 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Open Systems Today", } @Article{Baker:1995:GTP, author = "Mary Baker", title = "Going threadbare (panel session): sense or sedition? 
a debate on the threads abstraction", journal = j-OPER-SYS-REV, volume = "29", number = "5", pages = "227--227", month = dec, year = "1995", CODEN = "OSRED8", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:55 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @Article{Baker:1995:UOV, author = "Henry G. Baker", title = "``Use-once'' variables and linear objects: storage management, reflection and multi-threading", journal = j-SIGPLAN, volume = "30", number = "1", pages = "45--52", month = jan, year = "1995", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:16:59 MST 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan1990.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Banerjee:1995:PCD, author = "Prithviraj Banerjee and John A. Chandy and Manish Gupta and Eugene W. {Hodges IV} and John G. Holm and Antonio Lain and Daniel J. 
Palermo and Shankar Ramaswamy and Ernesto Su", title = "The {Paradigm} compiler for distributed-memory multicomputers", journal = j-COMPUTER, volume = "28", number = "10", pages = "37--47", month = oct, year = "1995", CODEN = "CPTRB4", ISSN = "0018-9162 (print), 1558-0814 (electronic)", ISSN-L = "0018-9162", bibdate = "Mon Feb 3 07:21:26 MST 1997", bibsource = "Compendex database; https://www.math.utah.edu/pub/tex/bib/computer1990.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, affiliation = "Illinois Univ., Urbana, IL, USA", affiliationaddress = "Urbana-Champaign, IL, USA", classification = "722.3; 722.4; 723.1; 723.2; C6110P (Parallel programming); C6150C (Compilers, interpreters and other processors); C6150N (Distributed systems software)", fjournal = "Computer", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2", journalabr = "Computer", keywords = "Address space; Automatic parallelization; Codes (symbols); Computational methods; Computer hardware; Computer programming; Data communication systems; Data parallelism; Data partitioning; Data processing; Distributed memory multicomputer; Distributed-memory multicomputers; Efficient software; Explicitly managed communication; Functional parallelism; Irregular computations; Manually distribution; Massively parallel computers; Multithreading; Paradigm compiler; Parallel algorithms; Parallel processing systems; Parallel programming; Program compilers; Regular computations; Sequential programs; Supercomputers", thesaurus = "Distributed memory systems; Parallel machines; Parallel programming; Parallelising compilers; Program compilers", } @Book{Bic:1995:ATD, author = "Lubomir Bic and Guang R. 
Gao and Jean-Luc Gaudiot", title = "Advanced topics in dataflow computing and multithreading", publisher = pub-IEEE, address = pub-IEEE:adr, pages = "x + 450", year = "1995", ISBN = "0-8186-6541-6, 0-8186-6540-8 (paperback)", ISBN-13 = "978-0-8186-6541-7, 978-0-8186-6540-0 (paperback)", LCCN = "QA76.9.A73A356 1994", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Computer architecture; Data structures (Computer science); Parallel processing (Electronic computers)", } @Article{Blumofe:1995:CEM, author = "Robert D. Blumofe and Christopher F. Joerg and Bradley C. Kuszmaul and Charles E. Leiserson and Keith H. Randall and Yuli Zhou", title = "{Cilk}: an efficient multithreaded runtime system", journal = j-SIGPLAN, volume = "30", number = "8", pages = "207--216", month = aug, year = "1995", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:17:08 MST 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Cilk (pronounced `silk') is a C-based runtime system for multithreaded parallel programming. In this paper, we document the efficiency of the Cilk work-stealing scheduler, both empirically and analytically. We show that on real and synthetic applications, the `work' and `critical path' of a Cilk computation can be used to accurately model performance. Consequently, a Cilk programmer can focus on reducing the work and critical path of his computation, insulated from load balancing and other runtime scheduling issues. We also prove that for the class of `fully strict' (well-structured) programs, the Cilk scheduler achieves space, time, and communication bounds all within a constant factor of optimal. 
The Cilk runtime system currently runs on the Connection Machine CM5 massively parallel processor (MPP), the Intel Paragon MPP, the Silicon Graphics Power Challenge symmetric multiprocessor (SMP), and the MIT Phish network of workstations. Applications written in Cilk include protein folding, graphic rendering, backtrack searching, and the *Socrates chess program, which won third prize in the 1994 ACM International Computer Chess Championship.", acknowledgement = ack-nhfb, affiliation = "Lab. for Comput. Sci., MIT, Cambridge, MA, USA", classification = "C6110P (Parallel programming); C6150C (Compilers, interpreters and other processors); C6150N (Distributed systems software)", fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "*Socrates chess program; Accurate performance modelling; Backtrack searching; C-based multithreaded runtime system; Cilk; Communication bounds; Connection Machine CM5; Critical path; Efficiency; Fully strict programs; Graphic rendering; Intel Paragon; Load balancing; MIT Phish workstation network; Parallel programming; Protein folding; Runtime scheduling issues; Silicon Graphics Power Challenge; Space bounds; Time bounds; Well-structured programs; Work-stealing scheduler", thesaurus = "Backtracking; Biology computing; Molecular configurations; Parallel programming; Processor scheduling; Program interpreters; Proteins; Rendering [computer graphics]", } @PhdThesis{Blumofe:1995:EMP, author = "Robert D. (Robert David) Blumofe", title = "Executing multithreaded programs efficiently", type = "Thesis ({Ph.D.})", school = "Massachusetts Institute of Technology, Department of Electrical Engineering and Computer Science", address = "Cambridge, MA, USA", pages = "145", year = "1995", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @InProceedings{Bubeck:1995:DSC, author = "T. Bubeck and M. Hiller and W. 
Kuchlin and W. Rosenstiel", title = "Distributed symbolic computation with {DTS}", crossref = "Ferreira:1995:PAI", pages = "231--248", year = "1995", bibdate = "Sun Dec 22 10:19:23 MST 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography1990.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pvm.bib", acknowledgement = ack-nhfb, affiliation = "Wilhelm-Schickard-Inst. fur Inf., Tubingen Univ., Germany", classification = "C4130 (Interpolation and function approximation); C4240P (Parallel programming and algorithm theory); C6110P (Parallel programming); C6115 (Programming support); C6130S (Data security); C6150N (Distributed systems software)", keywords = "Anonymous compute servers; Asynchronous RPC abstraction; C threads interface; Cryptosystem; Distributed symbolic computation; Distributed threads system; DTS; Fork/join parallel programming; Highly data-dependent algorithm parallelisation; Irregular algorithm parallelisation; Multiprocessor workstation; Multithreading; Parallel long integer multiplication; Parallel multi-variate polynomial resultant computation; Performance results; Programming environment; PVM; Shared memory threads", thesaurus = "Arithmetic; Cryptography; Distributed memory systems; Multiprocessing programs; Multiprocessing systems; Parallel algorithms; Parallel programming; Polynomials; Programming environments; Remote procedure calls; Shared memory systems; Software performance evaluation; Symbol manipulation; Workstations", } @Article{Byrd:1995:MPA, author = "G. T. Byrd and M. A. 
Holliday", title = "Multithreaded processor architectures", journal = j-IEEE-SPECTRUM, volume = "32", number = "8", pages = "38--46", month = aug, year = "1995", CODEN = "IEESAM", DOI = "https://doi.org/10.1109/6.402166", ISSN = "0018-9235 (print), 1939-9340 (electronic)", ISSN-L = "0018-9235", bibdate = "Thu Jan 16 07:37:23 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeespectrum1990.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Spectrum", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=6", keywords = "Application software; Computer architecture; computer architecture; Delay; Hardware; High performance computing; idle cycles; instruction streams; Job shop scheduling; Large-scale systems; latency; microprocessor chips; multiple concurrent execution streams; multiprogramming; multithreaded processor architectures; performance; Registers; single processor; Supercomputers; time-consuming operation", } @Article{Caudal:1995:DEM, author = "F. Caudal and B. 
Lecussan", title = "Design and Evaluation of a Multi-Threaded Architecture for Parallel Graph Reduction", journal = j-LECT-NOTES-COMP-SCI, volume = "964", pages = "411--??", year = "1995", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Sat May 11 13:45:32 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Cejtin:1995:HOD, author = "Henry Cejtin and Suresh Jagannathan and Richard Kelsey", title = "Higher-Order Distributed Objects", journal = j-TOPLAS, volume = "17", number = "5", pages = "704--739", month = sep, year = "1995", CODEN = "ATPSDT", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Fri Jan 5 07:58:42 MST 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org/pubs/toc/Abstracts/0164-0925/213986.html", abstract = "We describe a distributed implementation of Scheme that permits efficient transmission of higher-order objects such as closures and continuations. The integration of distributed communication facilities within a higher-order programming language engenders a number of new abstractions and paradigms for distributed computing. Among these are user-specified load-balancing and migration policies for threads, incrementally linked distributed computations, and parameterized client-server applications. To our knowledge, this is the first distributed dialect of Scheme (or a related language) that addresses lightweight communication abstractions for higher-order objects.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Programming Languages and Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783", keywords = "experimentation; languages", subject = "{\bf D.1.3}: Software, PROGRAMMING TECHNIQUES, Concurrent Programming, Distributed programming. 
{\bf D.3.2}: Software, PROGRAMMING LANGUAGES, Language Classifications, Applicative languages. {\bf D.3.2}: Software, PROGRAMMING LANGUAGES, Language Classifications, Extensible languages. {\bf D.3.3}: Software, PROGRAMMING LANGUAGES, Language Constructs and Features, Concurrent programming structures. {\bf D.3.2}: Software, PROGRAMMING LANGUAGES, Language Classifications, SCHEME.", } @Article{Chang:1995:CSM, author = "C.-Y. Chang and J.-P. Sheu", title = "Compile-time scheduling of multithread with data localities on multiple vector processors", journal = j-CPE, volume = "7", number = "5", pages = "349--369", month = aug, year = "1995", CODEN = "CPEXEI", ISSN = "1040-3108", ISSN-L = "1040-3108", bibdate = "Tue Sep 7 05:40:19 MDT 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/cpe.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Concurrency, practice and experience", } @Article{Chang:1995:CTS, author = "C.-Y. Chang and J.-P. 
Sheu", title = "Compile-time scheduling of multithread with data localities on multiple vector processors", journal = j-CPE, volume = "7", number = "5", pages = "349--369", month = aug, year = "1995", CODEN = "CPEXEI", ISSN = "1040-3108", ISSN-L = "1040-3108", bibdate = "Tue Sep 7 05:40:19 MDT 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Concurrency, practice and experience", xxnote = "Apparent duplicate of entry Chang:1995:CSM; all bibliographic fields are identical.", } @Article{Chong:1995:PAF, author = "Yong-Kim Chong and Kai Hwang", title = "Performance Analysis of Four Memory Consistency Models for Multithreaded Multiprocessors", journal = j-IEEE-TRANS-PAR-DIST-SYS, volume = "6", number = "10", pages = "1085--1099", month = oct, year = "1995", CODEN = "ITDSEO", DOI = "https://doi.org/10.1109/71.473517", ISSN = "1045-9219 (print), 1558-2183 (electronic)", ISSN-L = "1045-9219", bibdate = "Fri Nov 6 12:31:15 MST 1998", bibsource = "Compendex database; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.computer.org/tpds/td1995/l1085abs.htm", acknowledgement = ack-nhfb, affiliation = "Nanyang Technological Univ", affiliationaddress = "Singapore, Singapore", classification = "716.1; 722.1; 722.3; 722.4; 921.4; 922.1; C1160 (Combinatorial mathematics); C5440 (Multiprocessing systems); C5470 (Performance evaluation and testing)", corpsource = "Sch. of Electr. and Electron. Eng., Nanyang Technol. 
Univ., Singapore", fjournal = "IEEE Transactions on Parallel and Distributed Systems", journal-URL = "http://www.computer.org/tpds/archives.htm", journalabr = "IEEE Trans Parallel Distrib Syst", keywords = "attributes; Bandwidth; Buffer storage; cache interferences; Computer networks; Computer selection and evaluation; Computer simulation; Context switching; Data communication systems; Data storage equipment; Distributed shared memory; distributed shared memory models; embedded Markov chains; evaluation; Latency hiding techniques; Markov processes; memory consistency models; Memory consistency models; memory event reordering; multiprocessing systems; Multiprocessing systems; multithreaded multiprocessors; Multithreaded multiprocessors; performance; Performance; performance analysis; Performance evaluation; Petri net models; Petri nets; Processors; rate; scalable multiprocessors; Scalable multiprocessors; stochastic timed Petri nets; Stochastic timed Petri nets; synchronisation; synchronization; Synchronization; Telecommunication traffic; write buffers", treatment = "A Application; P Practical", } @TechReport{Chrisochoides:1995:MMDa, author = "Nikos Chrisochoides", title = "Multithreaded model for dynamic load balancing parallel adaptive {PDE} computations", type = "Technical report", number = "CTC95, TR221", institution = "Cornell Theory Center, Cornell University", address = "Ithaca, NY, USA", pages = "23", year = "1995", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, alttitle = "Multi-threaded model for dynamic load balancing parallel adaptive PDE computations", } @TechReport{Chrisochoides:1995:MMDb, author = "Nikos Chrisochoides", title = "Multithreaded model for dynamic load balancing parallel adaptive {PDE} computations", type = "{NASA} contractor report 198244; {ICASE} report 95-83.", institution = "Institute for Computer Applications in Science and Engineering NASA 
Langley Research Center", address = "Hampton, VA, USA", pages = "i + 23 + i", month = nov, year = "1995", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "To appear in Applied Numerical Mathematics Journal.", abstract = "We present a multithreaded model for the dynamic load-balancing of numerical, adaptive computations required for the solution of Partial Differential Equations (PDEs) on multiprocessors. Multithreading is used as a means of exploring concurrency at the processor level in order to tolerate synchronization costs inherent to traditional (non-threaded) parallel adaptive PDE solvers. Our preliminary analysis for parallel, adaptive PDE solvers indicates that multithreading can be used as a mechanism to mask overheads required for the dynamic balancing of processor workloads with computations required for the actual numerical solution of the PDEs. Also, multithreading can simplify the implementation of dynamic load-balancing algorithms, a task that is very difficult for traditional data parallel adaptive PDE computations. Unfortunately, multithreading does not always simplify program complexity, often makes code re-usability difficult, and increases software complexity.", acknowledgement = ack-nhfb, annote = "Supported in part by an Alex Nason Prize Award Supported in part by the NSF, supplemented by ARPA. Supported in part by the National Aeronautics and Space Administration.", keywords = "Differential equations, Partial; Parallel programming (Computer science); Synchronization; Threads (Computer programs)", } @Article{Coorg:1995:PNS, author = "S. R. 
Coorg", title = "Partitioning Non-Strict Functional Languages for Multi-Threaded Code Generation", journal = j-LECT-NOTES-COMP-SCI, volume = "983", pages = "82--??", year = "1995", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Sat May 11 13:45:32 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @MastersThesis{Divekar:1995:IMP, author = "Ravindra Divekar", title = "The impact of multithreading on the performance of superscalar processors", type = "Thesis ({M.A.})", number = "2117", school = "State University of New York at Binghamton, Thomas J. Watson School of Engineering and Applied Science", address = "Binghamton, NY, USA", pages = "vi + 73", year = "1995", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", series = "Master's theses / State University of New York at Binghamton", acknowledgement = ack-nhfb, keywords = "Operating systems (Computers)", } @Article{Dorojevets:1995:MDA, author = "M. N. Dorojevets and V. G. 
Oklobdzija", title = "Multithreaded Decoupled Architecture", journal = j-INT-J-HIGH-SPEED-COMPUTING, volume = "7", number = "3", pages = "465--??", year = "1995", CODEN = "IHSCEZ", ISSN = "0129-0533", bibdate = "Mon Feb 25 11:19:23 MST 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; OCLC Article1st database", acknowledgement = ack-nhfb, fjournal = "International Journal of High Speed Computing (IJHSC)", } @Article{Drusinsky:1995:VDE, author = "Doron Drusinsky", title = "Visually Designing Embedded-Systems Applications", journal = j-DDJ, volume = "20", number = "6", pages = "62, 64, 66, 68, 104--106", month = jun, year = "1995", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Thu Jan 9 09:35:43 MST 1997", bibsource = "Compendex database; http://www.ddj.com/index/author/index.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database", abstract = "Doron describes how design tools that incorporate object-oriented inheritance and extended state diagrams (the visual counterpart of finite state machines) can be used to build control systems.", acknowledgement = ack-nhfb, affiliation = "R-Active Concepts and Co-Active Concepts, Ltd", classification = "721.1; 722.4; 723.1; 723.1.1; 723.2; 723.5; C5140 (Firmware); C6110J (Object-oriented programming); C6110P (Parallel programming); C6140D (High level languages)", fjournal = "Dr. 
Dobb's Journal of Software Tools", journalabr = "Dr Dobb's J Software Tools Prof Program", keywords = "C; C (programming language); C++ listing; Codes (SYMBOLS); Computer aided software engineering; Computer software; Computer systems; Concurrency; Digital answering machine; Embedded systems; Embedded-systems application; ESD; Extended state diagram; Extended state diagrams; Finite automata; Finite state diagram; Firmware; Hierarchy; Inheritance; Interactive computer systems; Microcode; Multithreading; Object oriented programming; Operating-system-like routine; Reactive system; Real time system; State diagram; Synchronization; Systems analysis; Visual synchronization; Visually designing", pagecount = "4", thesaurus = "C language; C listings; Firmware; Object-oriented programming; Real-time systems", } @TechReport{Dubey:1995:SSM, author = "Pradeep Dubey", title = "Single-program speculative multithreading ({SPSM}) architecture: compiler-assisted fine-grained multithreading", type = "Research report", number = "RC 19928 (88233)", institution = "IBM T. J. Watson Research Center", address = "Yorktown Heights, NY, USA", pages = "25", day = "6", month = feb, year = "1995", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Recent limit studies on instruction-level parallel processing, based on non-numeric applications, have reported significant performance gains from speculative execution of multiple control flows. This paper describes a new single-program speculative multithreading (SPSM) architecture, which can be viewed as an extension of any existing single-thread architecture. It enables speculative fetch, decode, and execution from multiple program locations simultaneously. Instruction threads are generated at compile-time using control dependence analysis. Inter-thread data dependences are also analyzed at compile-time. 
However, resource binding of instructions is performed only at run time, to offer binary compatibility across different implementations. New thread generation algorithms, being prototyped in a version of the TOBEY compiler, are also described. The SPSM architecture includes novel fork/suspend instructions which are used to identify independent instruction threads, and also to specify compile-time control flow speculations associated with inter-thread dependences.", acknowledgement = ack-nhfb, keywords = "Computer architecture", } @Article{Dugger:1995:MC, author = "Jim Dugger", title = "Multithreading in {C++}", journal = j-CCCUJ, volume = "13", number = "11", pages = "23--??", month = nov, year = "1995", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Fri Aug 30 16:52:23 MDT 1996", bibsource = "http://www.cuj.com/cbklist.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @InProceedings{Elmasri:1995:TCL, author = "N. Elmasri and H. H. J. Hum and G. R. 
Gao", title = "The Threaded Communication Library: Preliminary Experiences on a Multiprocessor with Dual-Processor Nodes", crossref = "ACM:1995:CPI", pages = "195--199", year = "1995", bibdate = "Mon Aug 26 10:38:41 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{English:1995:MC, author = "John English", title = "Multithreading in {C++}", journal = j-SIGPLAN, volume = "30", number = "4", pages = "21--28", month = apr, year = "1995", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:17:03 MST 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Manual{Fahringer:1995:UTDa, author = "Thomas Fahringer and Matthew Haines and Piyush Mehrotra", title = "On the utility of threads for data parallel programming", number = "198155", publisher = pub-NTIS, address = pub-NTIS:adr, pages = "??", year = "1995", LCCN = "NAS 1.26:198155 Govt Pubs", bibdate = "Fri May 10 12:18:17 MDT 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "Shipping list number 96-0037-M", series = "NASA contractor report", acknowledgement = ack-nhfb, keywords = "computation; interprocessor communication; parallel programming; particle in cell technique; relaxation method (mathematics)", } @InProceedings{Fahringer:1995:UTDb, author = "T. Fahringer and M. Haines and P. Mehrotra", title = "On the Utility of Threads for Data Parallel Programming", crossref = "ACM:1995:CPI", pages = "51--59", year = "1995", bibdate = "Mon Aug 26 10:38:41 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @InProceedings{Field:1995:PPS, author = "John Field and G. 
Ramalingam and Frank Tip", title = "Parametric program slicing", crossref = "ACM:1995:CRP", pages = "379--392", year = "1995", bibdate = "Mon May 3 12:52:30 MDT 1999", bibsource = "http://www.acm.org/pubs/toc/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/proceedings/plan/199448/p379-field/", abstract = "Program slicing is a technique for isolating computational threads in programs. In this paper, we show how to mechanically extract a family of practical algorithms for computing slices directly from semantic specifications. These algorithms are based on combining the notion of {\em dynamic dependence tracking\/} in term rewriting systems with a program representation whose behavior is defined via an equational logic. Our approach is distinguished by the fact that changes to the behavior of the slicing algorithm can be accomplished through simple changes in rewriting rules that define the semantics of the program representation. Thus, e.g., different notions of dependence may be specified, properties of language-specific datatypes can be exploited, and various time, space, and precision tradeoffs may be made. This flexibility enables us to generalize the traditional notions of static and dynamic slices to that of a {\em constrained\/} slice, where any subset of the inputs of a program may be supplied.", acknowledgement = ack-nhfb, keywords = "algorithms; languages", subject = "{\bf F.3.3} Theory of Computation, LOGICS AND MEANINGS OF PROGRAMS, Studies of Program Constructs, Program and recursion schemes. {\bf F.3.3} Theory of Computation, LOGICS AND MEANINGS OF PROGRAMS, Studies of Program Constructs, Functional constructs. {\bf F.3.2} Theory of Computation, LOGICS AND MEANINGS OF PROGRAMS, Semantics of Programming Languages. {\bf F.3.1} Theory of Computation, LOGICS AND MEANINGS OF PROGRAMS, Specifying and Verifying and Reasoning about Programs, Specification techniques. 
{\bf F.4.2} Theory of Computation, MATHEMATICAL LOGIC AND FORMAL LANGUAGES, Grammars and Other Rewriting Systems. {\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language Classifications, C.", } @Article{Finger:1995:LTC, author = "Jonathan Finger", title = "Lightweight Tasks in {C}", journal = j-DDJ, volume = "20", number = "5", pages = "48, 50, 102", month = may, year = "1995", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Tue Sep 03 09:16:50 1996", bibsource = "Compendex database; http://www.ddj.com/index/author/index.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database", abstract = "While most modern operating systems allow multiple threads within a process, earlier-generation systems do not. Jonathan presents a multithreading package that allows for cooperatively multitasked threads within a single process for operating systems that do not explicitly support threads.", acknowledgement = ack-nhfb, classification = "722.4; 723.1; 723.1.1; C6110B (Software engineering techniques); C6150J (Operating systems)", fjournal = "Dr. Dobb's Journal of Software Tools", journalabr = "Dr Dobb's J Software Tools Prof Program", keywords = "C; C (programming language); Codes (SYMBOLS); Computer operating systems; Context switch; Cooperative task switching; Cooperatively multitasked threads; DOS; High level language; Lightweight tasker; Lightweight tasks; Microsoft compiler; Minicomputer platform; MIX Software; Modern operating systems; Multi-C package; Multiple processes; Multiprocessing systems; Multiprogramming; Multitasking system; Multithreading code; Multithreading package; Multiuser application; Multiuser mailing list management system; PC/DOS system; Preemptive task switching; Program compilers; Software engineering; Tenberry Software; Threads; Watcom compiler", pagecount = "2", thesaurus = "C listings; Multiprogramming; Software portability", } @Article{Fiske:1995:TPT, author = "Stuart Fiske and William J. 
Dally", title = "Thread prioritization: a thread scheduling mechanism for multiple-context parallel processors", journal = j-FUT-GEN-COMP-SYS, volume = "11", number = "6", pages = "503--518", month = oct, year = "1995", CODEN = "FGSEVI", ISSN = "0167-739X (print), 1872-7115 (electronic)", ISSN-L = "0167-739X", bibdate = "Sat Jan 10 12:00:22 MST 2004", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Future Generation Computer Systems", journal-URL = "http://www.sciencedirect.com/science/journal/0167739X", remark = "High-Performance Computer Architecture.", } @Article{Ford:1995:EDT, author = "Dan Ford", title = "Event-Driven Threads In {C++}", journal = j-DDJ, volume = "20", number = "6", pages = "48--50, 52, 54, 98, 100, 102", month = jun, year = "1995", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Thu Jan 9 09:35:43 MST 1997", bibsource = "Compendex database; http://www.ddj.com/index/author/index.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database", abstract = "Dan presents a powerful, multithreaded architecture that can be used by almost any application. Implemented in C++, this class library lets you quickly create and control threads.", acknowledgement = ack-nhfb, affiliation = "Hewlett--Packard", classification = "721.1; 722.4; 723.1; 723.1.1; 723.2; 723.5; C6110J (Object-oriented programming); C6110P (Parallel programming); C6140D (High level languages)", fjournal = "Dr. 
Dobb's Journal of Software Tools", journalabr = "Dr Dobb's J Software Tools Prof Program", keywords = "C; C (programming language); C++; Computer aided software engineering; Computer architecture; Computer simulation; Data structures; Equivalence classes; Event driven threads; Hierarchical systems; Interthread communication; Message driven thread; Multithreaded; Multithreaded applications; Multithreading; Object oriented programming; Object oriented programming application; Object-oriented infrastructure; Parallel processing; Parallelism; Synchronization; Synchronization strategies", pagecount = "5", thesaurus = "C language; C listings; Object-oriented programming; Parallel programming", } @Article{Ford:1995:ETC, author = "Dan Ford", title = "Event-Driven Threads In {C++}", journal = j-DDJ, volume = "20", number = "6", pages = "48--50, 52, 54, 98, 100, 102", month = jun, year = "1995", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Thu Jan 9 09:35:43 MST 1997", bibsource = "Compendex database; http://www.ddj.com/index/author/index.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database", abstract = "Dan presents a powerful, multithreaded architecture that can be used by almost any application. Implemented in C++, this class library lets you quickly create and control threads.", acknowledgement = ack-nhfb, affiliation = "Hewlett--Packard", classification = "721.1; 722.4; 723.1; 723.1.1; 723.2; 723.5; C6110J (Object-oriented programming); C6110P (Parallel programming); C6140D (High level languages)", fjournal = "Dr. 
Dobb's Journal of Software Tools", journalabr = "Dr Dobb's J Software Tools Prof Program", keywords = "C; C (programming language); C++; Computer aided software engineering; Computer architecture; Computer simulation; Data structures; Equivalence classes; Event driven threads; Hierarchical systems; Interthread communication; Message driven thread; Multithreaded; Multithreaded applications; Multithreading; Object oriented programming; Object oriented programming application; Object-oriented infrastructure; Parallel processing; Parallelism; Synchronization; Synchronization strategies", pagecount = "5", thesaurus = "C language; C listings; Object-oriented programming; Parallel programming", xxnote = "Apparent duplicate of entry Ford:1995:EDT; all bibliographic fields are identical.", } @Book{Gao:1995:ATD, author = "Guang R. Gao and Lubomir Bic and Jean-Luc Gaudiot", title = "Advanced topics in dataflow computing and multithreading", publisher = pub-IEEE, address = pub-IEEE:adr, pages = "x + 450", year = "1995", ISBN = "0-8186-6541-6 (hardcover), 0-8186-6540-8 (paperback), 0-8186-6542-4", ISBN-13 = "978-0-8186-6541-7 (hardcover), 978-0-8186-6540-0 (paperback), 978-0-8186-6542-4", LCCN = "QA76.9.A73 A356 1995", bibdate = "Sat Apr 20 11:22:41 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "computer architecture; data structures (computer science); parallel processing (electronic computers)", } @Article{Gerber:1995:IOX, author = "Bob Gerber", title = "{Informix} Online {XPS}", journal = j-SIGMOD, volume = "24", number = "2", pages = "463--463", month = may, year = "1995", CODEN = "SRECD8", ISSN = "0163-5808 (print), 1943-5835 (electronic)", ISSN-L = "0163-5808", bibdate = "Mon Jan 12 08:45:52 MST 2004", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classification = "C6110P (Parallel programming); C6150N (Distributed systems software); C6160B (Distributed databases)", fjournal = "ACM SIGMOD Record", journal-URL = 
"http://portal.acm.org/browse_dl.cfm?idx=J689", keywords = "Informix Dynamic Scalable Architecture; Informix Extended Parallel Server; Informix Online XPS; Large SMP systems; Light access methods; Linear performance speedups; Loosely coupled environments; Massively parallel clusters; Online database servers; Online/DSA servers; Open systems spectrum; Parallel database systems; Parallel resource management; Pipelined hash partitioned operators; SMP based high performance parallel data query; Table partitioning; Uniprocessor systems; XPS; XPS multithreaded process groups", thesaurus = "Distributed databases; File servers; Parallel programming; Query processing", xxcrossref = "Anonymous:1995:ASI", } @Article{Girkar:1995:ETL, author = "Milind Girkar and Constantine D. Polychronopoulos", title = "Extracting Task-Level Parallelism", journal = j-TOPLAS, volume = "17", number = "4", pages = "600--634", month = jul, year = "1995", CODEN = "ATPSDT", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Fri Jan 5 07:58:42 MST 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org/pubs/toc/Abstracts/0164-0925/210189.html", abstract = "Automatic detection of {\em task-level parallelism\/} (also referred to as functional, DAG, unstructured, or thread parallelism) at various levels of program granularity is becoming increasingly important for parallelizing and back-end compilers. Parallelizing compilers detect iteration-level or coarser granularity parallelism which is suitable for parallel computers; detection of parallelism at the statement-or operation-level is essential for most modern microprocessors, including superscalar and VLIW architectures. In this article we study the problem of detecting, expressing, and optimizing task-level parallelism, where ``task'' refers to a program statement of arbitrary granularity. 
Optimizing the amount of functional parallelism (by allowing synchronization between arbitrary nodes) in sequential programs requires the notion of {\em precedence\/} in terms of paths in graphs which incorporate control and data dependences. Precedences have been defined before in a different context; however, the definition was dependent on the ideas of parallel execution and time. We show that the problem of determining precedences statically is NP-complete. Determining precedence relationships is useful in finding the essential data dependences. We show that there exists a unique minimum set of essential data dependences; finding this minimum set is NP-hard and NP-easy. We also propose a heuristic algorithm for finding the set of essential data dependences. Static analysis of a program in the Perfect Benchmarks was done, and we present some experimental results.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Programming Languages and Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783", keywords = "algorithms; experimentation; languages; theory", subject = "{\bf D.3.4}: Software, PROGRAMMING LANGUAGES, Processors, Optimization. {\bf D.3.4}: Software, PROGRAMMING LANGUAGES, Processors, Compilers. {\bf F.1.3}: Theory of Computation, COMPUTATION BY ABSTRACT DEVICES, Complexity Classes, Reducibility and completeness. {\bf D.3.4}: Software, PROGRAMMING LANGUAGES, Processors, Code generation.", } @Article{Goossens:1995:FPM, author = "B. Goossens and D. T. Vu", title = "Further Pipelining and Multithreading to Improve {RISC} Processor Speed. 
{A} Proposed Architecture and Simulation Results", journal = j-LECT-NOTES-COMP-SCI, volume = "964", pages = "326--??", year = "1995", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Sat May 11 13:45:32 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @MastersThesis{Gulati:1995:MSM, author = "Manu Gulati", title = "Multithreading on a superscalar microprocessor", type = "Thesis ({M.S., Engineering})", school = "University of California, Irvine", address = "Irvine, CA, USA", pages = "x + 102", year = "1995", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Manual{Haines:1995:RSC, author = "Matthew Haines and Piyush Mehrotra and David Cronk", title = "Ropes, support for collective operations among distributed threads", number = "198157", publisher = pub-NTIS, address = pub-NTIS:adr, pages = "??", year = "1995", LCCN = "NAS 1.26:198157 Govt Pubs", bibdate = "Fri May 10 12:18:17 MDT 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "Shipping list number 96-0037-M", series = "NASA contractor report", acknowledgement = ack-nhfb, keywords = "computer system design; distributed processing; interprocessor communication; memory (computers); numerical control; parallel programming; threads", } @Article{Jensen:1995:DRT, author = "E. 
Douglas Jensen", title = "Distributed real-time operating systems", journal = j-DDJ, volume = "20", number = "2", pages = "32--34, 36, 38", month = feb, year = "1995", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Tue Sep 10 08:45:36 MDT 1996", bibsource = "http://www.ddj.com/index/author/index.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classification = "C6150N (Distributed systems software)", fjournal = "Dr. Dobb's Journal of Software Tools", keywords = "Distributed objects; Distributed operating systems; Operating systems; Real-time computing; Real-time operating systems; Real-time paradigm; Threads", thesaurus = "Network operating systems; Real-time systems", } @Article{Kavi:1995:DCM, author = "Krishna M. Kavi and A. R. Hurson and Phenil Patadia and Elizabeth Abraham and Ponnarasu Shanmugam", title = "Design of cache memories for multi-threaded dataflow architecture", journal = j-COMP-ARCH-NEWS, volume = "23", number = "2", pages = "253--264", month = may, year = "1995", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:47 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Kawamoto:1995:MTP, author = "S.-I. Kawamoto and T. 
Ito", title = "Multi-threaded {PaiLisp} with Granularity Adaptive Parallel Execution", journal = j-LECT-NOTES-COMP-SCI, volume = "907", pages = "94--??", year = "1995", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Sat May 11 13:45:32 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Kleiman:1995:IT, author = "Steve Kleiman and Joe Eykholt", title = "Interrupts as threads", journal = j-OPER-SYS-REV, volume = "29", number = "2", pages = "21--26", month = apr, year = "1995", CODEN = "OSRED8", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:41 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @Book{Kleiman:1995:PT, author = "Steve Kleiman and Devang Shah and Bart Smaalders", title = "Programming With Threads", publisher = pub-SUNSOFT, address = pub-SUNSOFT:adr, pages = "xxviii + 534", year = "1995", ISBN = "0-13-172389-8", ISBN-13 = "978-0-13-172389-4", LCCN = "QA76.58.K59 1996", bibdate = "Wed Dec 09 12:51:22 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", price = "US\$48.00", URL = "http://www.amazon.com/exec/obidos/ISBN=0131723898/sunworldonlineA/002-4892305-5599452", acknowledgement = ack-nhfb, } @Article{Lam:1995:CPC, author = "Richard B. 
Lam", title = "Cross-platform communication classes", journal = j-DDJ, volume = "20", number = "3", pages = "20, 22, 24, 26", month = mar, year = "1995", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Tue Sep 10 08:45:36 MDT 1996", bibsource = "http://www.ddj.com/index/author/index.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database", abstract = "Richard summarizes common techniques for interprocess communication, presenting a library that implements semaphores in a platform-independent manner to allow signaling or controlling of shared resources between processes and threads.", acknowledgement = ack-nhfb, classification = "C5620L (Local area networks); C6110J (Object-oriented programming); C6140D (High level languages); C6150N (Distributed systems software)", fjournal = "Dr. Dobb's Journal of Software Tools", keywords = "AIX; C++ libraries; Client/server computing; Cross platform C++ libraries; Cross-platform communication classes; Example library; Graphical user interfaces; Interprocess communications; OS/2; Semaphores; Shared resources; Windows NT", thesaurus = "C language; Client-server systems; Object-oriented languages; Object-oriented programming; Resource allocation; Software libraries", } @Article{Larcheveque:1995:OIP, author = "J.-M. Larchev{\^e}que", title = "Optimal Incremental Parsing", journal = j-TOPLAS, volume = "17", number = "1", pages = "1--15", month = jan, year = "1995", CODEN = "ATPSDT", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Fri Jan 5 07:58:42 MST 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org/pubs/toc/Abstracts/0164-0925/200996.html", abstract = "This communication sets the problem of incremental parsing in the context of a complete incremental compiling system. 
It turns out that, according to the incrementality paradigm of the attribute evaluator and data-flow analyzer to be used, two definitions of optimal incrementality in a parser are possible. Algorithms for achieving both forms of optimality are given, both of them based on ordinary LALR(1) parse tables. Optimality and correctness proofs, which are merely outlined in this communication, are made intuitive thanks to the concept of a {\em well-formed list of threaded trees}, a natural extension of the concept of {\em threaded tree\/} found in earlier works on incremental parsing.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Programming Languages and Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783", keywords = "algorithms; performance; theory", subject = "{\bf D.3.4}: Software, PROGRAMMING LANGUAGES, Processors, Parsing. {\bf D.2.6}: Software, SOFTWARE ENGINEERING, Programming Environments, Interactive. {\bf D.3.4}: Software, PROGRAMMING LANGUAGES, Processors, Compilers. {\bf E.1}: Data, DATA STRUCTURES, Trees.", } @Article{Lenatti:1995:RPM, author = "C. 
Lenatti", title = "{Rethinking in Parallel: Multiprocessing is on the rise, despite a dearth of tools to help create multithreaded applications}", journal = j-UNIXWORLD-OPEN-COMP, volume = "12", number = "8", pages = "57--??", year = "1995", CODEN = "OPCOEB", ISSN = "1072-4044", bibdate = "Fri Jan 26 17:24:01 MST 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "UnixWorld's Open Computing", } @Article{Leppanen:1995:PWO, author = "Ville Lepp{\"a}nen", title = "Performance of work-optimal {PRAM} simulation algorithms on coated meshes", journal = j-COMP-J, volume = "38", number = "10", pages = "801--810", month = "????", year = "1995", CODEN = "CMPJA6", ISSN = "0010-4620 (print), 1460-2067 (electronic)", ISSN-L = "0010-4620", bibdate = "Wed Jul 21 09:54:40 MDT 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www3.oup.co.uk/computer_journal/Volume_38/Issue_10/Vol38_10.index.html", URL = "http://www3.oup.co.uk/computer_journal/Volume_38/Issue_10/Vol38_10.body.html#AbstractLeppanen", acknowledgement = ack-nhfb, author-1-adr = "Department of Computer Science, University of Turku, Lemmink{\"a}isenkatu 14-18, Datacity, FIN-20520 Turku, Finland", classcodes = "C5220P (Parallel architecture); C7430 (Computer engineering); C5320G (Semiconductor storage); C6110P (Parallel programming); C4240C (Computational complexity)", corpsource = "Dept. of Comput. 
Sci., Turku Univ., Finland", email-1 = "Ville.Leppanen@cs.utu.fi", fjournal = "The Computer Journal", journal-URL = "http://comjnl.oxfordjournals.org/", keywords = "architectures; coated meshes; combining queues method; computational complexity; cost; greedy routing; mesh connected routing machinery; multithreading level; parallel; parallel algorithms; random-access storage; routing steps; simulated PRAM processors; simulation; sorting; synchronization wave; virtual leveled network technique; virtual machines; work optimal PRAM simulation algorithms", treatment = "P Practical", } @TechReport{Lim:1995:LPB, author = "Beng-Hong Lim and Ricardo Bianchini", title = "Limits on the performance benefits of multithreading and prefetching", type = "Research report", number = "RC 20238 (89547)", institution = "IBM T. J. Watson Research Center", address = "Yorktown Heights, NY, USA", pages = "23", day = "20", month = oct, year = "1995", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, annote = "Supported in part by ARPA. Supported in part by NSF Experimental Systems. Supported in part by a NSF Presidential Young Investigator Award", keywords = "Cache memory; Fault-tolerant computing; Multiprocessors", } @MastersThesis{Loikkanen:1995:FMS, author = "Matias Loikkanen", title = "A fine-grain multithreading superscalar architecture", type = "Thesis ({M.S., Engineering})", school = "University of California, Irvine", address = "Irvine, CA, USA", pages = "xi + 103", year = "1995", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @MastersThesis{Lu:1995:HMC, author = "Howard J. (Howard Jason) Lu", title = "Heterogeneous multithreaded computing", type = "Thesis ({M. 
Eng.})", school = "Massachusetts Institute of Technology, Department of Electrical Engineering and Computer Science", address = "Cambridge, MA, USA", pages = "21", year = "1995", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Maquelin:1995:CBM, author = "O. C. Maquelin and H. H. J. Hum and G. R. Gao", title = "Costs and Benefits of Multithreading with Off-the-Shelf {RISC} Processors", journal = j-LECT-NOTES-COMP-SCI, volume = "966", pages = "117--??", year = "1995", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Sat May 11 13:45:32 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @TechReport{Marsland:1995:SSM, author = "T. A. Marsland and Yaoqing Gao and Francis Chi-Moon Lau", title = "A study of software multithreading in distributed systems", type = "Technical report", number = "TR 95-23", institution = "Dept. of Computing Science, University of Alberta", address = "Edmonton, AB, Canada", pages = "25", year = "1995", ISSN = "0316-4683", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Mayes:1995:ULT, author = "K. R. Mayes and S. Quick and B. C. 
Warboys", title = "User-level threads on a general hardware interface", journal = j-OPER-SYS-REV, volume = "29", number = "4", pages = "57--62", month = oct, year = "1995", CODEN = "OSRED8", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:52 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @MastersThesis{Metz:1995:IDS, author = "David Metz", title = "Interface design and system impact analysis of a message-handling processor for fine-grain multithreading", type = "Thesis ({M.S.})", school = "Oregon State University", address = "Corvallis, OR, USA", pages = "63", year = "1995", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Multiprocessors; Parallel processing (Electronic computers)", } @MastersThesis{Miller:1995:TPC, author = "Robert C. (Robert Chisolm) Miller", title = "A type-checking preprocessor for {Cilk 2}, a multithreaded {C} language", type = "Thesis ({M. Eng.})", school = "Massachusetts Institute of Technology, Department of Electrical Engineering and Computer Science", address = "Cambridge, MA, USA", pages = "38", year = "1995", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @PhdThesis{Moore:1995:MPD, author = "Simon W. 
Moore", title = "Multithreaded processor design", type = "Thesis ({Ph.D.})", school = "University of Cambridge, Computer Laboratory", address = "Cambridge, Cambridgeshire, UK", pages = "xvi + 125", month = feb, year = "1995", LCCN = "QA76.9.A73 M66 1995", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "Available as Technical Report 358.", abstract = "Multithreaded processors aim to improve upon both control-flow and data-flow processor models by forming some amalgam of the two. They combine sequential behaviour from the control-flow model with concurrent aspects from data-flow design. Some multithreaded processor designs have added just a little concurrency to control-flow or limited sequential execution to data-flow. This thesis demonstrates that more significant benefits may be obtained by a more radical amalgamation of the two models. A data-driven microthread model is proposed, where a microthread is a short control-flow code sequence. To demonstrate the efficiency of this model, a suitable multithreaded processor, called Anaconda, is designed and evaluated. Anaconda incorporates a scalable temporally predictable memory tree structure with distributed virtual address translation and memory protection. A temporally predictable cached direct-mapped matching store is provided to synchronise data to microthreads. Code is prefetched into an instruction cache before execution commences. Earliest-deadline-first or fixed-priority scheduling is supported via a novel hardware priority queue. 
Control-flow execution is performed by a modified Alpha 21064 styled pipeline which assists comparison with commercial processors.", acknowledgement = ack-nhfb, annote = "Supported in part by a studentship from the UK Science and Engineering Research Council", keywords = "Computer architecture", } @Article{Oikawa:1995:RDU, author = "Shuichi Oikawa and Hideyuki Tokuda", title = "Reflection of developing user-level real-time thread packages", journal = j-OPER-SYS-REV, volume = "29", number = "4", pages = "63--76", month = oct, year = "1995", CODEN = "OSRED8", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:52 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @Article{Prabhakar:1995:IDO, author = "Ernest N. Prabhakar", title = "Implementing Distributed Objects", journal = j-DDJ, volume = "20", number = "8", pages = "80, 82, 84--85, 105--106", month = aug, year = "1995", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Thu Jan 9 09:35:43 MST 1997", bibsource = "Compendex database; http://www.ddj.com/index/author/index.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database", abstract = "Ernest uses NeXT's PDO and Objective-C to implement a simple client-server application that packages a legacy application into an interoperable object and its client.", acknowledgement = ack-nhfb, affiliation = "NextStep\slash OpenStep User Groups Int", classification = "722.1; 722.2; 722.3; 722.4; 723.1; C5620L (Local area networks); C6110J (Object-oriented programming); C6110P (Parallel programming); C6140D (High level languages)", fjournal = "Dr. 
Dobb's Journal of Software Tools", journalabr = "Dr Dobb's J Software Tools Prof Program", keywords = "Codes (symbols); Computer networks; Distributed applications; Distributed computer systems; Distributed objects; Interfaces (COMPUTER); Interoperable object; Interoperable objects; Legacy application; Multithreaded object; Network protocols; NeXT; Object oriented programming; Objective-C; PDO; Portable distributed objects; Program compilers; Simple client server application; Software prototyping; Storage allocation (computer); Table lookup", pagecount = "4", thesaurus = "C language; C listings; Client-server systems; Object-oriented programming; Parallel programming", } @Article{Prasad:1995:WNT, author = "Shashi Prasad", title = "{Windows NT} Threads --- a multithreaded application may actually run slower on an {SMP} machine than on its single-threaded equivalent. {Here}'s how to avoid that", journal = j-BYTE, volume = "20", number = "11", pages = "253--??", month = nov, year = "1995", CODEN = "BYTEDJ", ISSN = "0360-5280 (print), 1082-7838 (electronic)", ISSN-L = "0360-5280", bibdate = "Mon Aug 19 08:30:25 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "BYTE Magazine", } @Article{Prasad:1995:WTS, author = "Shashi Prasad", title = "Weaving a Thread --- {Solaris} and {Windows NT} bring the power, speed, and efficiency of multithreading and symmetric multiprocessing to the desktop", journal = j-BYTE, volume = "20", number = "10", pages = "173--??", month = oct, year = "1995", CODEN = "BYTEDJ", ISSN = "0360-5280 (print), 1082-7838 (electronic)", ISSN-L = "0360-5280", bibdate = "Mon Aug 19 08:30:21 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "BYTE Magazine", } @Book{Reich:1995:DHP, author = "David E. 
Reich", title = "Designing high-powered {OS/2 Warp} applications: the anatomy of multithreaded programs", publisher = pub-WILEY, address = pub-WILEY:adr, pages = "xxxi + 336", year = "1995", ISBN = "0-471-11586-X (paperback)", ISBN-13 = "978-0-471-11586-1 (paperback)", LCCN = "QA76.76.O63R437 1995", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Application software; Microcomputers -- Operating systems; Operating systems (Computers); OS/2 Warp", } @Article{Rodens:1995:ESC, author = "Ira Rodens", title = "Examining {Symantec C++} 7.0", journal = j-DDJ, volume = "20", number = "8", pages = "86--89, 106--107", month = aug, year = "1995", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Thu Jan 9 09:35:43 MST 1997", bibsource = "Compendex database; http://www.ddj.com/index/author/index.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database", abstract = "Among other features, this recent incarnation of Symantec C++ sports a visual programming environment, class and hierarchy editors, distributed build tools, and support for templates, exceptions, and run-time type identification. Compiler author Walter Bright adds tips and techniques for optimizing C++ code.", acknowledgement = ack-nhfb, affiliation = "CompuServe", classification = "722.2; 723.1; 723.1.1; 723.5; C6110J (Object-oriented programming); C6110V (Visual programming); C6115 (Programming support); C6130B (Graphics techniques); C6150G (Diagnostic, testing, debugging and evaluating systems); C6180G (Graphical user interfaces)", fjournal = "Dr. 
Dobb's Journal of Software Tools", journalabr = "Dr Dobb's J Software Tools Prof Program", keywords = "32-Bit multithreaded linker; Benchmarking; Browsers; Build tasks; C (programming language); C++ language; Codes (SYMBOLS); Computer programming; Distributed build tools; DOS; Exceptions an; Express Agents; File editors; Graphical user interfaces; Hierarchy editors; LAN; Linker; Multiscope debugger; Program compilers; Program debugging; Run time type identification; Run time type identification programming environment; Software engineering; Symantec C++ 7; Templates; Upgraded Microsoft Foundation Classes; Visual programming; Visual programming environment; Visual tools; Windows 95 resources", thesaurus = "Graphical user interfaces; Object-oriented programming; Program debugging; Software reviews; Software tools; Visual programming", } @Article{Rodley:1995:TPU, author = "John Rodley", title = "Thread Programming In {UnixWare} 2.0", journal = j-DDJ, volume = "20", number = "6", pages = "56, 58--61, 102, 104", month = jun, year = "1995", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Thu Jan 9 09:35:43 MST 1997", bibsource = "Compendex database; http://www.ddj.com/index/author/index.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database", abstract = "With the advent of UnixWare 2.0, threads have made their way to the UNIX desktop. John describes how threads are implemented and how you can take advantage of them.", acknowledgement = ack-nhfb, classification = "722.2; 722.4; 723.1; 723.2; 723.5; C6110P (Parallel programming); C6150J (Operating systems); C6150N (Distributed systems software)", fjournal = "Dr. 
Dobb's Journal of Software Tools", journalabr = "Dr Dobb's J Software Tools Prof Program", keywords = "Computer aided software engineering; Computer programming; Computer simulation; Concurrency programming; Fork; Lightweight processes; Multiprocessing; Multiprocessing systems; Multithreading; Object oriented programming; P1003.lc; Parallel programming; POSIX Portable Operating Systems Standard; Real time systems; Signal processing; Thread programming; Thread specification; UNIX; UnixWare 2.0; User interfaces", pagecount = "5", thesaurus = "Multiprocessing programs; Parallel programming; Unix", } @Article{Rogers:1995:SDD, author = "Anne Rogers and Martin C. Carlisle and John H. Reppy and L. J. Hendren", title = "Supporting Dynamic Data Structures on Distributed-Memory Machines", journal = j-TOPLAS, volume = "17", number = "2", pages = "233--263", month = mar, year = "1995", CODEN = "ATPSDT", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Fri Jan 5 07:58:42 MST 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org/pubs/toc/Abstracts/0164-0925/201065.html", abstract = "Compiling for distributed-memory machines has been a very active research area in recent years. Much of this work has concentrated on programs that use arrays as their primary data structures. To date, little work has been done to address the problem of supporting programs that use pointer-based dynamic data structures. The techniques developed for supporting SPMD execution of array-based programs rely on the fact that arrays are statically defined and directly addressable. Recursive data structures do not have these properties, so new techniques must be developed. In this article, we describe an execution model for supporting programs that use pointer-based dynamic data structures. 
This model uses a simple mechanism for migrating a thread of control based on the layout of heap-allocated data and introduces parallelism using a technique based on futures and lazy task creation. We intend to exploit this execution model using compiler analyses and automatic parallelization techniques. We have implemented a prototype system, which we call {\em Olden}, that runs on the Intel iPSC/860 and the Thinking Machines CM-5. We discuss our implementation and report on experiments with five benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Programming Languages and Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783", keywords = "experimentation; languages; measurement; performance", subject = "{\bf D.3.4}: Software, PROGRAMMING LANGUAGES, Processors, Run-time environments. {\bf D.1.3}: Software, PROGRAMMING TECHNIQUES, Concurrent Programming, Parallel programming. {\bf D.3.4}: Software, PROGRAMMING LANGUAGES, Processors, Compilers. {\bf D.3.3}: Software, PROGRAMMING LANGUAGES, Language Constructs and Features, Data types and structures. {\bf D.3.3}: Software, PROGRAMMING LANGUAGES, Language Constructs and Features, Dynamic storage management.", } @PhdThesis{Roh:1995:CGE, author = "Lucas J. Roh", title = "Code generations, evaluations, and optimizations in multithreaded executions", type = "Thesis ({Ph.D.})", school = inst-CSU, address = inst-CSU:adr, pages = "ix + 154", year = "1995", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Code generators; Computer architecture; Parallel processing (Electronic computers)", } @InProceedings{Schauser:1995:SCP, author = "Klaus E. Schauser and David E. Culler and Seth C. 
Goldstein", title = "Separation constraint partitioning: a new algorithm for partitioning non-strict programs into sequential threads", crossref = "ACM:1995:CRP", pages = "259--271", year = "1995", bibdate = "Mon May 3 12:52:30 MDT 1999", bibsource = "http://www.acm.org/pubs/toc/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/proceedings/plan/199448/p259-schauser/", abstract = "In this paper we present substantially improved thread partitioning algorithms for modern implicitly parallel languages. We present a new block partitioning algorithm, {\em separation constraint partitioning\/}, which is both more powerful and more flexible than previous algorithms. Our algorithm is guaranteed to derive maximal threads. We present a theoretical framework for proving the correctness of our partitioning approach, and we show how separation constraint partitioning makes interprocedural partitioning viable. We have implemented the partitioning algorithms in an Id90 compiler for workstations and parallel machines. Using this experimental platform, we quantify the effectiveness of different partitioning schemes on whole applications.", acknowledgement = ack-nhfb, keywords = "algorithms; experimentation; languages; theory; verification", subject = "{\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language Classifications, Parallel C. {\bf D.3.4} Software, PROGRAMMING LANGUAGES, Processors, Compilers. {\bf F.2.2} Theory of Computation, ANALYSIS OF ALGORITHMS AND PROBLEM COMPLEXITY, Nonnumerical Algorithms and Problems, Computations on discrete structures. 
{\bf F.3.3} Theory of Computation, LOGICS AND MEANINGS OF PROGRAMS, Studies of Program Constructs.", } @MastersThesis{Shahnaz:1995:DMD, author = "Munira Shahnaz", title = "Design of a multithreaded data cache for a hyperscalar processor", type = "Thesis ({M.S.})", school = "Department of Electrical Engineering, Texas A\&M University", address = "College Station, TX, USA", pages = "xi + 80", year = "1995", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Major electrical engineering", } @PhdThesis{Shankar:1995:STI, author = "Bhanu Shankar", title = "The spectrum of thread implementations on hybrid multithreaded architectures", type = "Thesis ({Ph.D.})", school = inst-CSU, address = inst-CSU:adr, pages = "xi + 176", year = "1995", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Computer architecture; Parallel processing (Electronic computers)", } @TechReport{Small:1995:SAB, author = "Christopher Small and Margo Seltzer", title = "Scheduler activations on {BSD}: sharing thread management between kernel and application", type = "Technical Report", number = "31-95", institution = "Center for Research in Computing Technology, Harvard University", address = "Cambridge, MA, USA", pages = "12", year = "1995", bibdate = "Tue Sep 17 07:11:15 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Spertus:1995:ELB, author = "Ellen Spertus and William J. 
Dally", title = "Evaluating the locality benefits of active messages", journal = j-SIGPLAN, volume = "30", number = "8", pages = "189--198", month = aug, year = "1995", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:17:08 MST 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "A major challenge in fine-grained computing is achieving locality without excessive scheduling overhead. We built two J-Machine implementations of a fine-grained programming model, the Berkeley Threaded Abstract Machine. One implementation takes an active messages approach, maintaining a scheduling hierarchy in software in order to improve data cache performance. Another approach relies on the J-Machine's message queues and fast task switch, lowering the control costs at the expense of data locality. Our analysis measures the costs and benefits of each approach, for a variety of programs and cache configurations. The active messages implementation is strongest when miss penalties are high and for the finest-grained programs. The hardware-buffered implementation is strongest in direct-mapped caches, where it achieves substantially better instruction cache performance.", acknowledgement = ack-nhfb, affiliation = "Lab. for Comput. 
Sci., MIT, Cambridge, MA, USA", classification = "C6110P (Parallel programming); C6120 (File organisation); C6150C (Compilers, interpreters and other processors); C6150N (Distributed systems software)", fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "Active messages; Benefits; Berkeley Threaded Abstract Machine; Cache configuration; Costs; Data cache performance; Data locality; Direct-mapped caches; Fast task switch; Fine-grained computing; Fine-grained programming model; Hardware-buffered; Instruction cache performance; J-Machine; Locality benefits; Message queues; Miss penalties; Scheduling hierarchy; Scheduling overhead", thesaurus = "Cache storage; Cost-benefit analysis; Parallel programming; Program compilers; Scheduling; Software performance evaluation", } @Article{Srinivasan:1995:MMX, author = "Murali V. Srinivasan", title = "A Methodology for Multithreaded {X} Client Development", journal = j-X-RESOURCE, volume = "13", number = "1", pages = "181--181", month = jan, year = "1995", CODEN = "XRESEA", ISBN = "1-56592-121-6", ISBN-13 = "978-1-56592-121-4", ISSN = "1058-5591", bibdate = "Fri Mar 31 06:55:49 1995", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "The X Resource", } @Article{Steensgaard:1995:ONC, author = "B. Steensgaard and E. 
Jul", title = "Object and native code thread mobility among heterogeneous computers (includes sources)", journal = j-OPER-SYS-REV, volume = "29", number = "5", pages = "68--77", month = dec, year = "1995", CODEN = "OSRED8", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:55 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @Article{Stuckey:1995:FCI, author = "Richard Stuckey", title = "A fully conformant implementation of {ECMA-162}", journal = j-ADA-USER, volume = "16", number = "2", pages = "83--94", month = jun, year = "1995", CODEN = "AUJOET", ISSN = "0268-652X", bibdate = "Mon Sep 8 18:43:50 MDT 1997", bibsource = "Compendex database; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "ICL has developed a portable implementation of the Ada interfaces to PCTE as specified by ECMA-162. The interfaces map the functionality required onto that provided by the C interfaces to PCTE as specified by ECMA-158. The process of implementing the interfaces revealed a number of errors in the ECMA PCTE standards, such as errors in ECMA-162 concerning the mapping of ECMA-149 onto Ada, errors in ECMA-158 such as missing operations or functions with incorrect parameter modes, discrepancies between the Ada and C bindings and errors in ECMA-149. The architecture of the interfaces and their test harness has been designed to allow easy porting from one PCTE implementation to another, and also from one Ada compilation system to another; some major constraints were imposed by the use of the C interfaces as the underlying platform, particularly regarding Ada's multi-threading abilities. The advantages of using the interfaces include the benefits of being able to implement tools in Ada instead of C; insulation from the underlying PCTE implementation; and the provision of facilities (e.g. 
call tracing) between tools and PCTE.", acknowledgement = ack-nhfb, affiliation = "ICL Enterprises", affiliationaddress = "Reading, Engl", classification = "722.2; 723.1; 723.1.1; 723.5; 902.2; C6115 (Programming support); C6140D (High level languages)", corpsource = "ICL Enterprises, Reading, UK", fjournal = "Ada User", journalabr = "Ada User J", keywords = "Ada; Ada (programming language); Ada compilation system; Ada interfaces; application program interfaces; bindings; C (programming language); C interfaces; call tracing; Codes (symbols); Computer aided software engineering; ECMA PCTE standards; ECMA-149; ECMA-158; ECMA-162; Errors; errors; fully conformant implementation; incorrect parameter modes; missing operations; multi-threading abilities; Portable Common Tools Environment; portable implementation; programming environments; software portability; software standards; software tools; Standards; test harness; User interfaces", pubcountry = "Netherlands", treatment = "P Practical", } @Book{SunSoft:1995:SMP, author = "{SunSoft}", title = "{Solaris} multithreaded programming guide", publisher = pub-SUNSOFT, address = pub-SUNSOFT:adr, pages = "xviii + 158", year = "1995", ISBN = "0-13-160896-7", ISBN-13 = "978-0-13-160896-2", LCCN = "QA76.76.O63 S635 1995", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Multiprocessors; Operating systems (Computers); Solaris (Computer file); UNIX (Computer file)", } @Article{Tamasanis:1995:MMW, author = "Doug Tamasanis", title = "{Mathematica} meets {Warp}", journal = j-BYTE, volume = "20", number = "5", month = may, year = "1995", CODEN = "BYTEDJ", ISSN = "0360-5280 (print), 1082-7838 (electronic)", ISSN-L = "0360-5280", bibdate = "Fri May 24 09:57:14 MDT 1996", bibsource = "Compendex database; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Wolfram Research has ported Mathematica, the software tool for 
quantitative analysis, from its Macintosh origins to a wide range of platforms, including PCs, Unix workstations, and several larger systems. The latest port of Mathematica 2.2 is to OS/2 Warp. Now OS/2 users do not have to rely on the Windows version of the Mathematica kernel, which only simulates multithreading. The new release takes full advantage of the OS/2 preemptive scheduler, threading, and 32-bit flat memory structure to both improve performance and to greatly increase the size of the problems Mathematica can handle. The OS/2 version is found faster and more stable than the Windows version.", acknowledgement = ack-nhfb, affiliation = "BYTE", classification = "722.2; 723.1; 723.1.1; 723.2; 723.5", fjournal = "BYTE Magazine", journalabr = "Byte", keywords = "C (programming language); Command line interface; Computer aided software engineering; Computer architecture; Computer operating systems; Computer simulation; Computer software; File editors; FORTRAN (programming language); Graphical user interfaces; Network protocols; Performance; Software Package Mathematica; Word processing", pagecount = "3", } @Article{Taylor:1995:CSA, author = "Richard N. Taylor and Kari A. Nies and Gregory Alan Bolcer and Craig A. MacFarlane and Kenneth M. Anderson and Gregory F. Johnson", title = "Chiron-1: a software architecture for user interface development, maintenance, and run-time support", journal = j-TOCHI, volume = "2", number = "2", pages = "105--144", month = jun, year = "1995", CODEN = "ATCIF4", ISSN = "1073-0516 (print), 1557-7325 (electronic)", ISSN-L = "1073-0516", bibdate = "Tue Jan 19 05:49:17 MST 1999", bibsource = "http://www.acm.org/pubs/contents/journals/tochi/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/journals/tochi/1995-2-2/p105-taylor/", abstract = "The Chiron-1 user interface system demonstrates key techniques that enable a strict separation of an application from its user interface. 
These techniques include separating the control-flow aspects of the application and user interface: they are concurrent and may contain many threads. Chiron also separates windowing and look-and-feel issues from dialogue and abstract presentation decisions via mechanisms employing a client-server architecture. To separate application code from user interface code, user interface agents called {\em artists\/} are attached to instances of application abstract data types (ADTs). Operations on ADTs within the application implicitly trigger user interface activities within the artists. Multiple artists can be attached to ADTs, providing multiple views and alternative forms of access and manipulation by either a single user or by multiple users. Each artist and the application run in separate threads of control. Artists maintain the user interface by making remote calls to an abstract depiction hierarchy in the Chiron server, insulating the user interface code from the specifics of particular windowing systems and toolkits. The Chiron server and clients execute in separate processes. The client-server architecture also supports multilingual systems: mechanisms are demonstrated that support clients written in programming languages other than that of the server while nevertheless supporting object-oriented server concepts. The system has been used in several universities and research and development projects. It is available by anonymous ftp.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Computer-Human Interaction", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J756", keywords = "design; languages", subject = "{\bf H.5.2} Information Systems, INFORMATION INTERFACES AND PRESENTATION, User Interfaces, User interface management systems (UIMS). {\bf D.2.2} Software, SOFTWARE ENGINEERING, Design Tools and Techniques, User interfaces.
{\bf D.2.m} Software, SOFTWARE ENGINEERING, Miscellaneous, Reusable software**.", } @PhdThesis{Thekkath:1995:DPM, author = "Radhika Thekkath", title = "Design and performance of multithreaded architectures", type = "Thesis ({Ph.D.})", school = "University of Washington", address = "Seattle, WA, USA", pages = "x + 100", year = "1995", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Computer architecture; Multiprocessors", } @MastersThesis{Todiwala:1995:DRT, author = "Khushroo Rustom Todiwala", title = "A distributed ray tracing implementation using multithreaded {RPC}", type = "Thesis ({M.S.})", number = "4691", school = "University of Texas at El Paso", address = "El Paso, TX, USA", pages = "xi + 140", year = "1995", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", series = "Master's thesis / University of Texas at El Paso", acknowledgement = ack-nhfb, keywords = "Electronic data processing -- Distributed processing", } @TechReport{Toulouse:1995:CID, author = "Michel Toulouse and Teodor Gabriel Crainic and Michel Gendreau", title = "Communication issues in designing cooperative multi-thread parallel searches", type = "Report", number = "CRT-95-47", institution = "Centre de recherche sur les transports, Universit{\'e} de Montr{\'e}al", address = "Montr{\'e}al, Qu{\'e}bec, Canada", year = "1995", bibdate = "Sat Apr 20 11:20:32 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Tullsen:1995:SMM, author = "Dean M. Tullsen and Susan J. Eggers and Henry M. 
Levy", title = "Simultaneous multithreading: maximizing on-chip parallelism", journal = j-COMP-ARCH-NEWS, volume = "23", number = "2", pages = "392--403", month = may, year = "1995", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:47 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", remark = "According to Hennessy and Patterson, Computer Architecture, 6th edition, online appendix M ``Historical Perspectives and References'', page M-36, this paper's authors ``provided the first realistic simulation assessment and coined the term {\em simultaneous multithreading}.''", } @Article{vanHoff:1995:JIP, author = "Arthur {van Hoff}", title = "{Java} and {Internet} Programming", journal = j-DDJ, volume = "20", number = "8", pages = "56, 58, 60--61, 101--102", month = aug, year = "1995", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Thu Jan 9 09:35:43 MST 1997", bibsource = "Compendex database; http://www.ddj.com/index/author/index.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database", URL = "http://www.ddj.com/ddj/issues/j508a.htm", abstract = "Java, a language designed for Internet development, is an object-oriented, multithreaded, portable, dynamic language that's similar to C, yet simpler than C++.", abstract2 = "In 1990, a new language called `Java' was developed which, it turns out, addresses many of the issues of software distribution on the Internet. Java is a simple, object-oriented, multi-threaded, garbage-collected, secure, robust, architecture-neutral, portable, high-performance, dynamic language. The language is similar to C and C++ but much simpler. Java programs are compiled into a binary format that can be executed on many platforms without recompilation. 
The language contains mechanisms to verify and execute binary Java programs in a controlled environment, protecting computer from potential viruses and security violations.", acknowledgement = ack-nhfb, affiliation = "Sun Microsystems", classification = "721.1; 722.2; 722.3; 723.1; 723.1.1; C6110J (Object-oriented programming); C6140D (High level languages); C6150N (Distributed systems software)", fjournal = "Dr. Dobb's Journal of Software Tools", journalabr = "Dr Dobb's J Software Tools Prof Program", keywords = "Architecture-neutral language; Binary format; Browser; Bytecodes; Bytecodes, Java language; C (programming language); Codes (symbols); Compilation; Computational linguistics; Computer networks; Computer programming languages; Computer software portability; Garbage-collection; High-performance dynamic language; Interactive programs; Interfaces (computer); Internet; Internet programming; Java (programming language); Multithreaded language; Multithreading; Object oriented programming; Object-oriented language; Portable language; Program compilers; Program interpreters; Robust language; Secure language; Security of data; Semantics; Software distribution; Software engineering; Syntax; UNIX", pagecount = "4", thesaurus = "Complete computer programs; Internet; Object-oriented languages; Object-oriented programming; Security of data; Software portability", } @Article{Wallach:1995:OAM, author = "Deborah A. Wallach and Wilson C. Hsieh and Kirk L. Johnson and M. Frans Kaashoek and William E. 
Weihl", title = "Optimistic active messages: a mechanism for scheduling communication with computation", journal = j-SIGPLAN, volume = "30", number = "8", pages = "217--226", month = aug, year = "1995", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:17:08 MST 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Low-overhead message passing is critical to the performance of many applications. Active messages (AMs) reduce the software overhead for message handling: messages are run as handlers instead of as threads, which avoids the overhead of thread management and the unnecessary data copying of other communication models. Scheduling the execution of AMs is typically done by disabling and enabling interrupts or by polling the network. This primitive scheduling control puts severe restrictions on the code that can be run in a message handler. This paper describes a new software mechanism, optimistic active messages (OAM), that eliminates these restrictions; OAMs allow arbitrary user code to execute in handlers, and also allow handlers to block. Despite this gain in expressiveness, OAMs perform as well as AMs. We used OAM as the base for a remote procedure calling (RPC) system, Optimistic RPC (ORPC), for the CM-5 multiprocessor; it consists of an optimized thread package and a stub compiler that hides communication details from the programmer. ORPC is 1.5 to 5 times faster than traditional RPC (TRPC) for small messages and performs as well as AMs. Applications that primarily communicate using large data transfers or are fairly coarse-grained perform equally well. For applications that send many short messages, however, the ORPC and AM implementations are up to 3 times faster than the TRPC implementations. 
Using ORPC, programmers obtain the benefits of well-proven programming abstractions, do not have to be concerned with communication details, and yet obtain nearly the performance of hand-coded AM programs.", acknowledgement = ack-nhfb, affiliation = "Lab. for Comput. Sci., MIT, Cambridge, MA, USA", classification = "C6150N (Distributed systems software)", fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "Application performance; Arbitrary user code; Blocking; CM-5 multiprocessor; Coarse-grained applications; Communication detail hiding; Communication scheduling; Computation scheduling; Expressiveness; Large data transfers; Low-overhead message passing; Message handlers; Optimistic active messages; Optimistic remote procedure calls; Optimized thread package; Programming abstractions; Software overhead; Stub compiler", thesaurus = "Message passing; Remote procedure calls; Scheduling", } @Article{Walter:1995:PMS, author = "Stephen Walter", title = "Put Multiprocessing Systems to Work. {II}", journal = j-UNIX-REVIEW, volume = "13", number = "1", pages = "39--??", month = jan, year = "1995", CODEN = "UNRED5", ISSN = "0742-3136", bibdate = "Sat May 25 07:59:58 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover library database", abstract = "Programming for multiprocessors requires use of unusual features such as spin locks, mutex locks, barrier synchronization, and the like. 
Using the POSIX threads API helps, but the rest you have to do yourself.", acknowledgement = ack-nhfb, fjournal = "UNIX review", } @Article{Wayner:1995:FAN, author = "Peter Wayner", title = "Free Agents: a new generation of light-weight, multithreaded operating environments provide security and interoperability for agent developers", journal = j-BYTE, volume = "20", number = "3", pages = "105--??", month = mar, year = "1995", CODEN = "BYTEDJ", ISSN = "0360-5280 (print), 1082-7838 (electronic)", ISSN-L = "0360-5280", bibdate = "Tue Jan 2 10:01:41 MST 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "BYTE Magazine", } @Article{Yam:1995:CFD, author = "Michael Yam", title = "A {C++} Framework for {DCE} Threads", journal = j-DDJ, volume = "20", type = "SB", number = "??", pages = "27--??", month = jul # "\slash " # aug, year = "1995", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Mon Sep 2 09:09:39 MDT 1996", bibsource = "http://www.ddj.com/index/author/index.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Dr. Dobb's Journal of Software Tools", } @InProceedings{Yasrebi:1995:EDO, author = "M. Yasrebi", title = "Experience with Distributed Objects in a Portable and Multithreaded Library for a {LAN\slash WAN} Gateway Application", crossref = "IEEE:1995:PCL", volume = "20", pages = "164--173", year = "1995", bibdate = "Mon Sep 27 14:16:06 MDT 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, annote = "Also known as LCN'95. 
IEEE Cat no 95TB100005", keywords = "computer communications; IEEE; LCN; local computer networks", } @Article{Aitken:1996:MCJ, author = "Gary Aitken", title = "Moving from {C++} to {Java}", journal = j-DDJ, volume = "21", number = "3", pages = "52, 54--56", month = mar, year = "1996", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Thu Jan 9 09:35:43 MST 1997", bibsource = "Compendex database; http://www.ddj.com/index/author/index.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database", abstract = "Java is claimed to be much easier to learn than C++, but the difficulties most people have in learning to program in both C++ and Java have little to do with the language itself. This paper explores some of the differences between Java and C++. The aim is to make users aware of potential problems and opportunities when moving from C++ to Java. Brief explanations are provided for those concepts that until now were unfamiliar to many users.", acknowledgement = ack-nhfb, affiliation = "Integrated Computer Solutions", classification = "721.1; 722.2; 723.1; 723.1.1; 723.2", fjournal = "Dr.
Dobb's Journal of Software Tools", journalabr = "Dr Dobb's J Software Tools Prof Program", keywords = "C (programming language); Character arrays; Character sets; Data structures; File organization; Garbage collected language; Header files; Interfaces (COMPUTER); Java; Machine code; Member function; Multithreading; Object oriented programming; Pointers; Program compilers; Program interpreters; Program processors; Program translators; Programming theory; Software engineering; Synchronization; Virtual machine", pagecount = "4", } @InProceedings{Amrhein:1996:CSM, author = "Beatrice Amrhein and Oliver Gloor and Wolfgang K{\"u}chlin", title = "A Case Study of Multi-Threaded {Gr{\"o}bner} Basis Completion", crossref = "LakshmanYN:1996:IPI", pages = "95--102", year = "1996", bibdate = "Thu Mar 12 08:43:16 MST 1998", bibsource = "http://www.acm.org/pubs/toc/; https://www.math.utah.edu/pub/tex/bib/issac.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/proceedings/issac/236869/p95-amrhein/", acknowledgement = ack-nhfb, keywords = "algebraic computation; algorithms; experimentation; ISSAC; performance; SIGNUM; SIGSAM; symbolic computation", subject = "{\bf I.1.3} Computing Methodologies, SYMBOLIC AND ALGEBRAIC MANIPULATION, Languages and Systems, Special-purpose algebraic systems. {\bf D.1.3} Software, PROGRAMMING TECHNIQUES, Concurrent Programming, Parallel programming. 
{\bf C.1.2} Computer Systems Organization, PROCESSOR ARCHITECTURES, Multiple Data Stream Architectures (Multiprocessors), Parallel processors**.", } @MastersThesis{Annavaram:1996:BVN, author = "Murali Annavaram", title = "Blocking versus non-blocking: issues and tradeoffs in multithreaded code execution", type = "Thesis ({M.S.})", school = inst-CSU, address = inst-CSU:adr, pages = "viii + 57", year = "1996", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Multiprocessors -- Design and construction; Parallel processing (Electronic computers)", } @Article{Anonymous:1996:WWD, author = "Anonymous", title = "World-wide distributed system using {Java} and the {Internet}", journal = j-IEEE-INT-SYMP-HIGH-PERF-DIST-COMP-PROC, pages = "11--18", year = "1996", CODEN = "PIDCFB", ISSN = "1082-8907", bibdate = "Thu Dec 12 06:31:53 MST 1996", bibsource = "Compendex database; https://www.math.utah.edu/pub/tex/bib/java.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "IEEE catalog number 96TB100069.", acknowledgement = ack-nhfb, affiliation = "California Inst of Technology", affiliationaddress = "CA, USA", classification = "716.1; 722.4; 723; 723.1; 723.1.1", conference = "Proceedings of the 1996 5th IEEE International Symposium on High Performance Distributed Computing", fjournal = "IEEE International Symposium on High Performance Distributed Computing, Proceedings", keywords = "Collaborative environments; Computer networks; Computer programming languages; Computer software; Data communication systems; Distributed computer systems; Internet; Java; Multithreaded objects; Object oriented programming; Program composition; World wide web", meetingaddress = "Syracuse, NY, USA", meetingdate = "Aug 6--9 1996", meetingdate2 = "08/06--09/96", sponsor = "IEEE", } @Article{Arnold:1996:MPJ, author = "K. Arnold and J. 
Gosling", title = "Multithreaded programming in {Java}", journal = j-WEB-TECHNIQUES, volume = "1", number = "7", pages = "34--40, 42--43", month = oct, year = "1996", CODEN = "WETEFA", ISSN = "1086-556X", bibdate = "Sat Mar 15 08:49:09 MST 1997", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classcodes = "C6150N (Distributed systems software); C6110J (Object-oriented programming); C6140D (High level languages); C6150J (Operating systems)", fjournal = "Web Techniques", keywords = "display; display code; dynamic behaviour; handshaking; interactive program; interrupts; Java; Java object oriented language; multiple; multiprogramming; multithreaded programming; multithreaded system; object-oriented languages; object-oriented programming; operations; parallel programming; polling; problems; real world software; synchronisation; threads; updates; user input", treatment = "P Practical", } @Article{Bellosa:1996:PIL, author = "Frank Bellosa and Martin Steckermeier", title = "The Performance Implications of Locality Information Usage in Shared-Memory Multiprocessors", journal = j-J-PAR-DIST-COMP, volume = "37", number = "1", pages = "113--121", day = "25", month = aug, year = "1996", CODEN = "JPDCER", DOI = "https://doi.org/10.1006/jpdc.1996.0112", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Thu Mar 9 09:19:00 MST 2000", bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0112/production; http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0112/production/pdf", acknowledgement = ack-nhfb, classification = "C5220P (Parallel architecture); C5440 (Multiprocessing systems); C5470 (Performance evaluation and testing)", corpsource = "Dept. of Comput. Sci. 
IV, Erlangen-Nurnberg Univ., Germany", fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", keywords = "cache miss counters; cache storage; evaluation; locality information; memory multiprocessors; parallel architectures; performance; scalable shared-; scheduling decisions; shared memory systems; shared-memory multiprocessors; thread scheduling algorithms", treatment = "P Practical", } @InProceedings{Benson:1996:DMS, author = "G. D. Benson and R. A. Olsson", title = "The design of microkernel support for the {SR} concurrent programming language", crossref = "Szymanski:1996:LCR", pages = "227--240", year = "1996", bibdate = "Sat Sep 28 18:12:58 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/mach.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, affiliation = "Dept. of Comput. Sci., California Univ., Davis, CA, USA", classification = "C6110P (Parallel programming); C6140D (High level languages); C6150J (Operating systems); C6150N (Distributed systems software)", keywords = "Distributed environment; Distributed operating system; Distributed programming; Distributed programming language; Mach microkernel; Message passing; Microkernel; Microkernel support; Minimal kernel; Multithreaded program; Networked operating system; Parallel programming; SR concurrent programming language", thesaurus = "Distributed processing; Message passing; Multiprocessing programs; Network operating systems; Operating system kernels; Parallel languages", } @Article{Berg:1996:HDT, author = "C. 
Berg", title = "How do threads work and how can {I} create a general-purpose event?", journal = j-DDJ, volume = "21", number = "11", pages = "111--115, 126--127", month = nov, year = "1996", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Sat Mar 15 08:49:09 MST 1997", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classcodes = "C6110J (Object-oriented programming); C6140D (High level languages); C6150J (Operating systems); C6150N (Distributed systems software)", corpsource = "Digital Focus, USA", fjournal = "Dr. Dobb's Journal of Software Tools", keywords = "(computers); application; application program interfaces; applications; event; exception handling; general-purpose event; Internet; Java; Java thread mechanism; languages; lightweight processes; multiprocessor architecture; multithreading; object; object-oriented; object-oriented programming; operating systems; oriented language; programming interface; scheduling; synchronisation; synchronization; thread programming; threads; web", treatment = "P Practical", } @Article{Berg:1996:JQH, author = "Cliff Berg", title = "{Java Q and A}: How do Threads Work and How Can {I} Create a General-Purpose Event?", journal = j-DDJ, volume = "21", number = "11", pages = "111--??", day = "1", month = nov, year = "1996", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Tue Oct 15 08:20:29 1996", bibsource = "http://www.ddj.com/index/author/index.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Dr. Dobb's Journal of Software Tools", } @InProceedings{Bhandarkar:1996:MPM, author = "M. A. Bhandarkar and L. V. 
Kale", title = "{MICE}: a prototype {MPI} implementation in {Converse} environment", crossref = "IEEE:1996:PSM", pages = "26--31", year = "1996", bibdate = "Sat Apr 19 16:34:54 MDT 1997", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classification = "C6110P (Parallel programming); C6115 (Programming support); C6150E (General utility programs); C6150N (Distributed systems software)", conftitle = "Proceedings. Second MPI Developer's Conference", corpsource = "Dept. of Comput. Sci., Illinois Univ., Urbana, IL, USA", keywords = "Abstract Device Interface; application program interfaces; communication; computations; Converse interoperable parallel programming environment; message managers; message passing; MICE; MPI modules; MPICH; multi-threaded MPI programs; open systems; parallel programming; programming environments; prototype MPI implementation; public-domain MPI implementation; PVM interoperation; thread objects; utility programs", sponsororg = "IEEE Comput. Soc. Tech. 
Committee on Distributed Process", treatment = "P Practical", } @Article{Bianchini:1996:EPM, author = "Ricardo Bianchini and Beng-Hong Lim", title = "Evaluating the Performance of Multithreading and Prefetching in Multiprocessors", journal = j-J-PAR-DIST-COMP, volume = "37", number = "1", pages = "83--97", day = "25", month = aug, year = "1996", CODEN = "JPDCER", DOI = "https://doi.org/10.1006/jpdc.1996.0109", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Thu Mar 9 09:19:00 MST 2000", bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0109/production; http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0109/production/pdf", acknowledgement = ack-nhfb, classification = "C5220P (Parallel architecture); C5440 (Multiprocessing systems); C5470 (Performance evaluation and testing); C6110P (Parallel programming); C6150N (Distributed systems software)", corpsource = "COPPE Syst. Eng., Federal Univ. of Rio de Janeiro, Brazil", fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", keywords = "cache; memory latency; MIT Alewife multiprocessor; multiprocessing systems; multiprocessors; multithreading; parallel; parallel architectures; performance evaluation; programming; software prefetching; storage management", treatment = "P Practical", } @Article{Blumofe:1996:CEM, author = "Robert D. Blumofe and Christopher F. Joerg and Bradley C. Kuszmaul and Charles E. Leiserson and Keith H. 
Randall and Yuli Zhou", title = "{Cilk}: An Efficient Multithreaded Runtime System", journal = j-J-PAR-DIST-COMP, volume = "37", number = "1", pages = "55--69", day = "25", month = aug, year = "1996", CODEN = "JPDCER", DOI = "https://doi.org/10.1006/jpdc.1996.0107", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Thu Mar 9 09:19:00 MST 2000", bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0107/production; http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0107/production/pdf", acknowledgement = ack-nhfb, classification = "C4240P (Parallel programming and algorithm theory); C6110P (Parallel programming)", corpsource = "Lab. for Comput. Sci., MIT, Cambridge, MA, USA", fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", keywords = "Cilk; critical path analysis; critical-path length; directed acyclic graph; load balancing; multithreaded runtime system; parallel; parallel algorithms; parallel programming; processor scheduling; programming; runtime scheduling; synchronisation", treatment = "P Practical; T Theoretical or Mathematical", } @Article{Bundgen:1996:SCM, author = "Reinhard B{\"u}ndgen and Manfred G{\"o}bel and Wolfgang K{\"u}chlin", title = "Strategy Compliant Multi-Threaded Term Completion", journal = j-J-SYMBOLIC-COMP, volume = "21", number = "4/5/6", pages = "475--506 (or 475--505??)", month = apr # ", " # may # " \& " # jun, year = "1996", CODEN = "JSYCEH", ISSN = "0747-7171 (print), 1095-855X (electronic)", ISSN-L = "0747-7171", MRclass = "68Q42 (68Q22 68Q40)", MRnumber = "1 420 910", bibdate = "Sat May 10 15:54:09 MDT 1997", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "Parallel symbolic computation.", acknowledgement = ack-nhfb, classcodes 
= "C7310 (Mathematics computing); C5440 (Multiprocessing systems); C4210L (Formal languages and computational linguistics); C6130 (Data handling techniques)", corpsource = "Wilhelm-Schickard-Inst. fur Inf., Tubingen Univ., Germany", fjournal = "Journal of Symbolic Computation", journal-URL = "http://www.sciencedirect.com/science/journal/07477171", keywords = "completion module AC; Knuth--Bendix completion; parallel; parallel architectures; rewriting systems; shared memory; strategy compliant multi-threaded term completion; symbol manipulation; systems; term-rewriting system PaReDuX; unfailing completion", treatment = "A Application; P Practical", } @Article{Chrisochoides:1996:MMD, author = "Nikos Chrisochoides", title = "Multithreaded model for the dynamic load-balancing of parallel adaptive {PDE} computations", journal = j-APPL-NUM-MATH, volume = "20", number = "4", pages = "349--365", day = "3", month = jun, year = "1996", CODEN = "ANMAEL", ISSN = "0168-9274 (print), 1873-5460 (electronic)", ISSN-L = "0168-9274", bibdate = "Wed Jul 28 14:36:24 MDT 1999", bibsource = "Compendex database; http://www.elsevier.com/cgi-bin/cas/tree/store/apnum/cas_free/browse/browse.cgi?year=1996&volume=20&issue=4; https://www.math.utah.edu/pub/tex/bib/applnummath.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.elsevier.com/cgi-bin/cas/tree/store/apnum/cas_sub/browse/browse.cgi?year=1996&volume=20&issue=4&aid=652", acknowledgement = ack-nhfb, affiliation = "Cornell Univ", affiliationaddress = "Ithaca, NY, USA", classification = "722.4; 723.1; 723.5; 731.1; 921.2; 921.6", fjournal = "Applied Numerical Mathematics: Transactions of IMACS", journal-URL = "http://www.sciencedirect.com/science/journal/01689274", journalabr = "Appl Numer Math", keywords = "Calculations; Codes (symbols); Computational complexity; Computer software; Dynamic load balancing; Load balancing algorithms; Mathematical models; Multicomputers; Multithreaded model; Numerical methods; 
Parallel processing systems; Partial differential equations; Processor workloads; Program complexity; Program processors; Synchronization", } @Article{Drake:1996:IJT, author = "Donald G. Drake", title = "Introduction to {Java} threads", journal = j-JAVAWORLD, volume = "1", number = "2", pages = "??--??", month = apr, year = "1996", CODEN = "????", ISSN = "1091-8906", bibdate = "Thu Aug 13 08:48:26 MDT 1998", bibsource = "http://www.javaworld.com/javaworld/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.javaworld.com/javaworld/jw-04-1996/jw-04-threads.htm", acknowledgement = ack-nhfb, } @Article{Eickemeyer:1996:EMU, author = "Richard J. Eickemeyer and Ross E. Johnson and Steven R. Kunkel and Mark S. Squillante and Shiafun Liu", title = "Evaluation of multithreaded uniprocessors for commercial application environments", journal = j-COMP-ARCH-NEWS, volume = "24", number = "2", pages = "203--212", month = may, year = "1996", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:47 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Engelhardt:1996:PIP, author = "Dean Engelhardt and Andrew Wendelborn", title = "A Partitioning-Independent Paradigm for Nested Data Parallelism", journal = j-INT-J-PARALLEL-PROG, volume = "24", number = "4", pages = "291--317", month = aug, year = "1996", CODEN = "IJPPE5", ISSN = "0885-7458 (print), 1573-7640 (electronic)", ISSN-L = "0885-7458", bibdate = "Sat Apr 26 11:36:49 MDT 1997", bibsource = "Compendex database; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, affiliation = "Univ of Adelaide", affiliationaddress = "Aust", classification = "721.1; 722.4; 723.1.1; 723.2; 723.5; C6110P (Parallel programming); C6120 (File 
organisation); C6150C (Compilers, interpreters and other processors); C6150N (Distributed systems software)", corpsource = "Dept. of Comput. Sci., Adelaide Univ., SA, Australia", fjournal = "International Journal of Parallel Programming", journal-URL = "http://link.springer.com/journal/10766", journalabr = "Int J Parallel Program", keywords = "abstract machine; Computational methods; Computer simulation; costs; data parallel model; data partitioning; Data structures; data structures; High level languages; irregular data structures; Multi threading; multinode execution model; Multiprocessing systems; multiprocessing systems; multiprocessor machines; nested data parallelism; Nested data parallelism; nested data structures; nodal multi-threading; one-dimensional data parallel operator; parallel computation; Parallel execution models; Parallel processing systems; parallel programming; partitioning-independent paradigm; Performance; performance statistics; program compilers; software performance evaluation; Thinking machines; Thinking Machines CM-5", treatment = "P Practical", } @Article{Esposito:1996:MVB, author = "Dino Esposito", title = "Multithreading and {Visual Basic}", journal = j-DDJ, volume = "21", number = "12", pages = "46--??", month = dec, year = "1996", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Sat Mar 07 08:22:15 1998", bibsource = "http://www.ddj.com/index/author/index.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Although Visual Basic does not support native multithreading, it does support the Windows API. This means you can write VB applications composed of two or more threads. Dino shows you how to create multithreaded applications using both the SDK and Visual Basic", acknowledgement = ack-nhfb, fjournal = "Dr. 
Dobb's Journal of Software Tools", } @PhdThesis{Farber:1996:EAM, author = "Philipp Farber", title = "Execution architecture of the multithreaded {ADAM} prototype", type = "Thesis ({doctoral})", number = "13", school = "Swiss Federal Institute of Technology", address = "Zurich, Switzerland", pages = "iv + 127", year = "1996", ISBN = "3-7281-2384-6", ISBN-13 = "978-3-7281-2384-8", LCCN = "????", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", series = "TIK-Schriftenreihe", acknowledgement = ack-nhfb, keywords = "Computer architecture; Parallel processing (Electronic computers); Parallel programming (Computer science)", } @InProceedings{Farcy:1996:ISP, author = "A. Farcy and O. Temam", title = "Improving Single-Process Performance with Multithreaded Processors", crossref = "ACM:1996:FCP", pages = "350--357", year = "1996", bibdate = "Wed Mar 18 12:33:18 MST 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, annote = "Also known as ICS'96. Held as part of the Federated computing research conference (FCRC'96)", keywords = "ACM; architecture; computer; FCRC; ICS; SIGARCH; supercomputing", } @Article{Fatouron:1996:SAS, author = "P. Fatourou and P. Spirakis", title = "Scheduling Algorithms for Strict Multithreaded Computations", journal = j-LECT-NOTES-COMP-SCI, volume = "1178", pages = "407--??", year = "1996", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Fri Aug 22 11:59:49 MDT 1997", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Feuerstein:1996:MTP, author = "E. Feuerstein and A. S.
{De Loma}", title = "On Multi-threaded Paging", journal = j-LECT-NOTES-COMP-SCI, volume = "1178", pages = "417--??", year = "1996", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Fri Aug 22 11:59:49 MDT 1997", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @InProceedings{Foster:1996:MIW, author = "I. Foster and J. Geisler and S. Tuecke", title = "{MPI} on the {I-WAY}: a wide-area, multimethod implementation of the {Message Passing Interface}", crossref = "IEEE:1996:PSM", pages = "10--17", year = "1996", bibdate = "Sat Apr 19 16:34:54 MDT 1997", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classification = "C5620W (Other computer networks); C6110B (Software engineering techniques); C6115 (Programming support); C6130S (Data security); C6150E (General utility programs); C6150N (Distributed systems software)", conftitle = "Proceedings. Second MPI Developer's Conference", corpsource = "Argonne Nat. Lab., IL, USA", keywords = "application program interfaces; authentication; automatic configuration mechanisms; communication mechanisms; geographically distributed computing resources; geographically distributed database resources; geographically distributed graphics resources; geographically distributed networking; heterogeneous systems; high-speed wide-area networks; I-WAY distributed- computing experiment; message authentication; message passing; Message Passing Interface; MPICH; Nexus multithreaded runtime system; parallel programming; portable high-performance programming model; process creation; programming environments; software environment; software libraries; utility programs; wide area networks", sponsororg = "IEEE Comput. Soc. Tech. 
Committee on Distributed Process", treatment = "P Practical", } @Article{Foster:1996:NAI, author = "Ian Foster and Carl Kesselman and Steven Tuecke", title = "The {Nexus} Approach to Integrating Multithreading and Communication", journal = j-J-PAR-DIST-COMP, volume = "37", number = "1", pages = "70--82", day = "25", month = aug, year = "1996", CODEN = "JPDCER", DOI = "https://doi.org/10.1006/jpdc.1996.0108", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Thu Mar 9 09:19:00 MST 2000", bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0108/production; http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0108/production/pdf", acknowledgement = ack-nhfb, classification = "C6110P (Parallel programming); C6150C (Compilers, interpreters and other processors); C6150N (Distributed systems software)", corpsource = "Div. of Math. and Comput. Sci., Argonne Nat. Lab., IL, USA", fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", keywords = "asynchronous messaging; client-server systems; compiler target; data communication; distributed; distributed-memory systems; dynamic; dynamic communication; global memory model; global pointer; mechanism; memory systems; message passing; multithreading; Nexus runtime system; parallel languages; parallel programming; program compilers; remote service request; synchronisation; thread creation", treatment = "P Practical", } @Article{Goldstein:1996:LTI, author = "Seth Copen Goldstein and Klaus Erik Schauser and David E. 
Culler", title = "Lazy Threads: Implementing a Fast Parallel Call", journal = j-J-PAR-DIST-COMP, volume = "37", number = "1", pages = "5--20", day = "25", month = aug, year = "1996", CODEN = "JPDCER", DOI = "https://doi.org/10.1006/jpdc.1996.0104", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Thu Mar 9 09:19:00 MST 2000", bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0103/production; http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0103/production/pdf; http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0104/production; http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0104/production/pdf", acknowledgement = ack-nhfb, classification = "C4240P (Parallel programming and algorithm theory); C6120 (File organisation)", corpsource = "Comput. Sci. Div., California Univ., Berkeley, CA, USA", fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", keywords = "code generation strategy; lazy threads; multithreaded execution models; parallel call; parallel programming; parallel-ready sequential call; storage management", treatment = "T Theoretical or Mathematical", } @MastersThesis{Gollapudi:1996:MCA, author = "Sreenivas Gollapudi", title = "A multithreaded client-server architecture for distributed multimedia systems", type = "Thesis ({M.S.})", school = "Dept. 
of Computer Science, State University of New York at Buffalo", address = "Buffalo, NY, USA", pages = "viii + 72", year = "1996", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "Also available as technical report 96-13.", acknowledgement = ack-nhfb, keywords = "Electronic data processing -- Distributed processing; Multimedia systems -- Design and construction; Multitasking (Computer science)", } @Article{Grunwald:1996:WPO, author = "Dirk Grunwald and Richard Neves", title = "Whole-Program Optimization for Time and Space Efficient Threads", journal = j-SIGPLAN, volume = "31", number = "9", pages = "50--59", month = sep, year = "1996", CODEN = "SINODQ", ISBN = "0-89791-767-7", ISBN-13 = "978-0-89791-767-4", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat May 1 15:50:57 MDT 1999", bibsource = "http://www.acm.org/pubs/toc/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "Co-published as SIGOPS Operating Systems Review {\bf 30}(5), December 1996, and as SIGARCH Computer Architecture News, {\bf 24}(special issue), October 1996.", URL = "http://www.acm.org:80/pubs/citations/proceedings/asplos/237090/p50-grunwald/", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "algorithms; design; languages; performance", subject = "{\bf D.3.4} Software, PROGRAMMING LANGUAGES, Processors, Optimization. {\bf C.1.2} Computer Systems Organization, PROCESSOR ARCHITECTURES, Multiple Data Stream Architectures (Multiprocessors), Parallel processors**. {\bf D.1.3} Software, PROGRAMMING TECHNIQUES, Concurrent Programming, Parallel programming.", } @Article{Hamilton:1996:JSN, author = "Marc A. 
Hamilton", title = "{Java} and the Shift to Net-Centric Computing", journal = j-COMPUTER, volume = "29", number = "8", pages = "31--39", month = aug, year = "1996", CODEN = "CPTRB4", ISSN = "0018-9162 (print), 1558-0814 (electronic)", ISSN-L = "0018-9162", bibdate = "Sat Mar 15 08:49:09 MST 1997", bibsource = "Compendex database; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover library database", note = "Mentions Java's use of Unicode characters.", abstract = "Java, with its write once, run anywhere model, changes the basic techniques by which software is designed, developed, and deployed.", acknowledgement = ack-nhfb, affiliation = "Sun Microsystems", affiliationaddress = "El Segundo, CA, USA", classcodes = "C6140D (High level languages); C6110J (Object-oriented programming); C7210 (Information services and centres); C6120 (File organisation)", classification = "722.1; 722.3; 723; 723.1; 723.1.1; 723.2; 723.3; 723.5; C6110J (Object-oriented programming); C6120 (File organisation); C6140D (High level languages); C7210 (Information services and centres)", corpsource = "Sun Microsyst., El Segundo, CA, USA", fjournal = "Computer", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2", journalabr = "Computer", keywords = "application program interfaces; application programming; C; C (programming language); C++; computer aided software; Computer architecture; Computer hardware; Computer networks; Computer operating systems; Computer programming languages; Computer simulation; Computer software; Computer software portability; Distributed database systems; Dynamic linking; engineering; environments; garbage collection; interfaces; Internet; Internet, Object oriented programming; interpreted language; Java; Java programming language; language; management; Memory management; Middleware; Middleware, Computer programming languages; multithreading; Multithreading; multithreading; Multithreading; multithreading; Net centric computing; 
net-centric computing; Network centric computing; Numeric data types; Object oriented programming; object-; object-oriented languages; object-oriented programming; oriented programming; program compiler; Program compilers; program debugging; Program interpreters; program testing; programming environments; Security of data; software development; Software engineering; software-development life cycle; storage; Storage allocation (computer); Virtual machines; Web browser; Web browsers; World Wide Web", treatment = "P Practical", } @Article{Helmbold:1996:TRC, author = "D. P. Helmbold and C. E. McDowell", title = "A Taxonomy of Race Conditions", journal = j-J-PAR-DIST-COMP, volume = "33", number = "2", pages = "159--164", day = "15", month = mar, year = "1996", CODEN = "JPDCER", DOI = "https://doi.org/10.1006/jpdc.1996.0034", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Thu Mar 9 09:18:59 MST 2000", bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0034/production; http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0034/production/pdf", acknowledgement = ack-nhfb, classification = "C4230 (Switching theory); C4240P (Parallel programming and algorithm theory); C6110P (Parallel programming)", corpsource = "Dept. of Comput. and Inf. 
Sci., California Univ., Santa Cruz, CA, USA", fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", keywords = "access; anomalies; hazards and race conditions; multiple threads; nondeterministic behavior; parallel programming; race conditions taxonomy; timing", treatment = "P Practical; T Theoretical or Mathematical", } @Article{Hertzum:1996:BQO, author = "Morten Hertzum and Erik Fr{\o}kj{\ae}r", title = "Browsing and querying in online documentation: a study of user interfaces and the interaction process", journal = j-TOCHI, volume = "3", number = "2", pages = "136--161", month = jun, year = "1996", CODEN = "ATCIF4", ISSN = "1073-0516 (print), 1557-7325 (electronic)", ISSN-L = "1073-0516", bibdate = "Tue Jan 19 05:49:17 MST 1999", bibsource = "http://www.acm.org/pubs/contents/journals/tochi/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/journals/tochi/1996-3-2/p136-hertzum/", abstract = "A user interface study concerning the usage effectiveness of selected retrieval modes was conducted using an experimental text retrieval system, TeSS, giving access to online documentation of certain programming tools. Four modes of TeSS were compared: (1) browsing, (2) conventional boolean retrieval, (3) boolean retrieval based on Venn diagrams, and (4) these three combined. Further, the modes of TeSS were compared to the use of printed manuals. The subjects observed were 87 computer science students to whom the programming tools were new. In the experiment the use of printed manuals is faster and provides answers of higher quality than any of the electronic modes. Therefore, claims about the effectiveness of computer-based text retrieval have to be qualified in situations where printed manuals are manageable to the user. Among the modes of TeSS, browsing is the fastest and the one causing the fewest operational errors.
On the same two variables, time and operational errors, the Venn diagram mode performs better than conventional boolean retrieval. The combined mode scores worst on the objective performance measures; nonetheless nearly all subjects prefer this mode. Concerning the interaction process, the subjects tend to manage the complexities of the information retrieval tasks by issuing series of simple commands and exploiting the interactive capabilities of TeSS. To characterize the dynamics of the interaction process two concepts are introduced: threads and sequences of tactics. Threads in a query sequence describe the continuity during retrieval. Sequences of tactics concern the combined mode and describe how different retrieval modes succeed each other as the retrieval process evolves.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Computer-Human Interaction", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J756", keywords = "experimentation; human factors; performance", subject = "{\bf H.5.2} Information Systems, INFORMATION INTERFACES AND PRESENTATION, User Interfaces, Evaluation/methodology. {\bf H.3.3} Information Systems, INFORMATION STORAGE AND RETRIEVAL, Information Search and Retrieval, Query formulation. {\bf H.3.3} Information Systems, INFORMATION STORAGE AND RETRIEVAL, Information Search and Retrieval, Retrieval models. {\bf H.3.4} Information Systems, INFORMATION STORAGE AND RETRIEVAL, Systems and Software. {\bf H.5.2} Information Systems, INFORMATION INTERFACES AND PRESENTATION, User Interfaces, Training, help, and documentation.", } @MastersThesis{Hudson:1996:MDA, author = "Greg Hudson", title = "Multithreaded design in the {Athena} environment", type = "Thesis ({M.
Eng.})", school = "Massachusetts Institute of Technology, Department of Electrical Engineering and Computer Science", address = "Cambridge, MA, USA", pages = "240", year = "1996", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Hum:1996:SEM, author = "Herbert H. J. Hum and Olivier Maquelin and Kevin B. Theobald and Xinmin Tian and Guang R. Gao and Laurie J. Hendren", title = "A Study of the {EARTH-MANNA} Multithreaded System", journal = j-INT-J-PARALLEL-PROG, volume = "24", number = "4", pages = "319--348", month = aug, year = "1996", CODEN = "IJPPE5", ISSN = "0885-7458 (print), 1573-7640 (electronic)", ISSN-L = "0885-7458", bibdate = "Sat Apr 26 11:36:49 MDT 1997", bibsource = "Compendex database; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, affiliation = "Intel Corp", affiliationaddress = "OR, USA", classification = "722.3; 722.4; 723.5; 731.1; C5220P (Parallel architecture); C5440 (Multiprocessing systems); C5470 (Performance evaluation and testing); C6150N (Distributed systems software)", corpsource = "Dept. of Meas., Archit. 
and Planning, Intel Corp., Hillsboro, OR, USA", fjournal = "International Journal of Parallel Programming", journal-URL = "http://link.springer.com/journal/10766", journalabr = "Int J Parallel Program", keywords = "ASIC synchronization unit; benchmarks; Communication latency; communication latency; Computer architecture; Computer hardware; Computer simulation; Data communication systems; data flow computing; dataflow-like thread synchronizations; earth manna system; EARTH-MANNA multithreaded system; Execution unit; multiprocessing systems; Multiprocessing systems; multiprocessor systems; multithreaded architecture; Multithreaded system; off-the-shelf execution unit; parallel architectures; Parallel processing systems; performance; Performance; performance evaluation; processor scheduling; Program processors; remote requests; Scheduling; scheduling; sequentially-executed code; synchronisation; Synchronization; synchronization; Synchronization unit; uniprocessor performance", treatment = "P Practical", } @Article{Hurson:1996:CMD, author = "A. R. Hurson and Krishna M. 
Kavi and Behrooz Shirazi and Ben Lee", title = "Cache Memories for Dataflow Systems", journal = j-IEEE-PAR-DIST-TECH, volume = "4", number = "4", pages = "50--64", month = "Winter", year = "1996", CODEN = "IPDTEX", DOI = "https://doi.org/10.1109/88.544436", ISSN = "1063-6552 (print), 1558-1861 (electronic)", ISSN-L = "1063-6552", bibdate = "Mon Jun 7 07:52:29 MDT 1999", bibsource = "Compendex database; https://www.math.utah.edu/pub/tex/bib/ieeepardisttech.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://dlib.computer.org/pd/books/pd1996/pdf/p4050.pdf; http://www.computer.org/concurrency/pd1996/p4050abs.htm", acknowledgement = ack-nhfb, affiliation = "Pennsylvania State Univ", affiliationaddress = "PA, USA", classification = "721.1; 722.1; 722.2; 723; 723.1; 731.1; C5220P (Parallel architecture); C5320G (Semiconductor storage); C5440 (Multiprocessing systems); C6110P (Parallel programming); C6120 (File organisation)", corpsource = "Dept. of Comput. Sci. and Eng., Pennsylvania State Univ., University Park, PA, USA", fjournal = "IEEE Parallel and Distributed Technology: Systems and Applications", journalabr = "IEEE Parallel Distrib Technol", keywords = "Algorithms; architectural model; Buffer storage; cache memories; Cache misses; cache storage; Computer architecture; computer architectures; Computer systems programming; Context switching; control flow architecture; control flow processing; dataflow architectures; dataflow computation; dataflow environment; dataflow processing; dataflow program; dataflow programming environments; Dataflow systems; dataflow systems; localities; Memory latencies; Multithreading; parallel architectures; parallel machines; Parallel processing systems; parallel programming; Process control; Program compilers; Program processors; Sequential switching; Storage allocation (computer); temporal; Throughput; Virtual storage", treatment = "P Practical", } @PhdThesis{Joerg:1996:CSP, author = "Christopher F. 
(Christopher Frank) Joerg", title = "The {Cilk} system for parallel multithreaded computing", type = "Thesis ({Ph.D.})", school = "Massachusetts Institute of Technology, Department of Electrical Engineering and Computer Science", address = "Cambridge, MA, USA", pages = "199", year = "1996", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Karamcheti:1996:RME, author = "Vijay Karamcheti and John Plevyak and Andrew A. Chien", title = "Runtime Mechanisms for Efficient Dynamic Multithreading", journal = j-J-PAR-DIST-COMP, volume = "37", number = "1", pages = "21--40", day = "25", month = aug, year = "1996", CODEN = "JPDCER", DOI = "https://doi.org/10.1006/jpdc.1996.0105", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Thu Mar 9 09:19:00 MST 2000", bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0105/production; http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0105/production/pdf", acknowledgement = ack-nhfb, classification = "C4240P (Parallel programming and algorithm theory); C5220P (Parallel architecture); C6150C (Compilers, interpreters and other processors)", corpsource = "Dept. of Comput. 
Sci., Illinois Univ., Urbana, IL, USA", fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", keywords = "compiler; distributed memory machines; distributed memory systems; dynamic multithreading; hybrid; Illinois Concert runtime system; parallel; parallel architectures; program compilers; programming; pull messaging; stack-heap; threads", treatment = "P Practical", } @Book{Kleiman:1996:PT, author = "Steve Kleiman and Devang Shah and Bart Smaalders", title = "Programming with threads", publisher = pub-PH, address = pub-PH:adr, pages = "xxviii + 534", year = "1996", ISBN = "0-13-172389-8", ISBN-13 = "978-0-13-172389-4", LCCN = "QA76.58 .K53 1996", bibdate = "Fri May 10 12:18:17 MDT 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "multitasking (computer science); parallel programming (computer science); synchronization", } @Article{Leary:1996:CEH, author = "S. 
Leary", title = "{C++} exception handling in multithreaded programs", journal = j-C-PLUS-PLUS-REPORT, volume = "8", number = "2", pages = "20--31", month = feb, year = "1996", CODEN = "CRPTE7", ISSN = "1040-6042", bibdate = "Tue Mar 25 13:34:48 MST 1997", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classcodes = "C6110J (Object-oriented programming); C6140D (High level languages); C6150J (Operating systems); C6130 (Data handling techniques)", corpsource = "Dresser-Wayne Ind., USA", fjournal = "C++ Report", keywords = "C language; C++; exception handling; exception-aware thread class; exception-safe programming; lightweight threads; multiprogramming; multitasking; multithreaded programs; object oriented programming; object-; object-oriented programming; operating; oriented languages; OS/2; reusable C++ classes; software reusability; Solaris; systems; systems (computers); thread manager class; thread-safe reference counting class; Windows 95; Windows NT", treatment = "P Practical", } @Book{Lewis:1996:TPG, author = "Bil Lewis and Daniel J. 
Berg", title = "Threads Primer: a Guide to Multithreaded Programming", publisher = pub-SUNSOFT, address = pub-SUNSOFT:adr, pages = "xxvi + 319", year = "1996", ISBN = "0-13-443698-9", ISBN-13 = "978-0-13-443698-2", LCCN = "QA76.642 .L478 1996", bibdate = "Fri Apr 11 17:06:46 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", series = "Sun BluePrints Program", acknowledgement = ack-nhfb, keywords = "POSIX (Computer software standard); Threads (Computer programs); UNIX (Computer file)", } @Article{Lim:1996:LPB, author = "Beng-Hong Lim and Ricardo Bianchini", title = "Limits on the performance benefits of multithreading and prefetching", journal = j-SIGMETRICS, volume = "24", number = "1", pages = "37--46", month = may, year = "1996", CODEN = "????", DOI = "https://doi.org/10.1145/233008.233021", ISSN = "0163-5999 (print), 1557-9484 (electronic)", ISSN-L = "0163-5999", bibdate = "Thu Jun 26 11:21:30 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "This paper presents new analytical models of the performance benefits of multithreading and prefetching, and experimental measurements of parallel applications on the MIT Alewife multiprocessor. For the first time, both techniques are evaluated on a real machine as opposed to simulations. The models determine the region in the parameter space where the techniques are most effective, while the measurements determine the region where the applications lie. We find that these regions do not always overlap significantly. The multithreading model shows that only 2-4 contexts are necessary to maximize this technique's potential benefit in current multiprocessors. Multithreading improves execution time by less than 10\% for most of the applications that we examined. The model also shows that multithreading can significantly improve the performance of the same applications in multiprocessors with longer latencies. 
Reducing context-switch overhead is not crucial. The software prefetching model shows that allowing 4 outstanding prefetches is sufficient to achieve most of this technique's potential benefit on current multiprocessors. Prefetching improves performance over a wide range of parameters, and improves execution time by as much as 20-50\% even on current multiprocessors. The two models show that prefetching has a significant advantage over multithreading for machines with low memory latencies and/or applications with high cache miss rates because a prefetch instruction consumes less time than a context-switch.", acknowledgement = ack-nhfb, fjournal = "ACM SIGMETRICS Performance Evaluation Review", journal-URL = "http://portal.acm.org/toc.cfm?id=J618", } @Article{Lowenthal:1996:UFG, author = "David K. Lowenthal and Vincent W. Freeh and Gregory R. Andrews", title = "Using Fine-Grain Threads and Run-Time Decision Making in Parallel Computing", journal = j-J-PAR-DIST-COMP, volume = "37", number = "1", pages = "41--54", day = "25", month = aug, year = "1996", CODEN = "JPDCER", DOI = "https://doi.org/10.1006/jpdc.1996.0106", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Thu Mar 9 09:19:00 MST 2000", bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0106/production; http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0106/production/pdf", acknowledgement = ack-nhfb, classification = "C5220P (Parallel architecture); C6110P (Parallel programming)C4240P (Parallel programming and algorithm theory)", corpsource = "Dept. of Comput. 
Sci., Arizona Univ., Tucson, AZ, USA", fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", keywords = "computing; distributed shared memory; distributed-memory multiprocessors; fine-grain; fine-grain threads; parallel; parallel architectures; parallel programming; parallelism; run-time decision making", treatment = "P Practical", } @Article{Mane:1996:SJP, author = "I. Mane", title = "Survey of the {Java} programming language", journal = j-ELECTRONIK, volume = "45", number = "17", pages = "84--87", day = "20", month = "????", year = "1996", CODEN = "EKRKAR", ISSN = "0013-5658", bibdate = "Sat Mar 15 08:49:09 MST 1997", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classcodes = "C6140D (High level languages); C6150C (Compilers, interpreters and other processors)", countrypub = "Germany", fjournal = "Elektronik", keywords = "fixed; high level languages; Java programming language; memory partitions; multi-threading; program compilers; source code compiler", language = "German", treatment = "G General Review", } @PhdThesis{Mao:1996:PMS, author = "Weihua Mao", title = "Performance modeling of data prefetching and multithreading in scalable multiprocessors", type = "Thesis ({Ph.D.})", school = "University of Southern California", address = "Los Angeles, CA, USA", pages = "xi + 130", year = "1996", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, alttitle = "Performance modeling of data prefetching and multithreading in scalable multiprocessors", } @Article{McManis:1996:JDSa, author = "Chuck McManis", title = "{Java} In Depth: Synchronizing threads in {Java}", journal = j-JAVAWORLD, volume = "1", number = "2", pages = "??--??", month = apr, year = "1996", CODEN = "????", ISSN = "1091-8906", bibdate = "Thu Aug 13 08:48:26 MDT 1998", bibsource = 
"http://www.javaworld.com/javaworld/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.javaworld.com/javaworld/jw-04-1996/jw-04-synch.htm", acknowledgement = ack-nhfb, } @Article{McManis:1996:JDSb, author = "Chuck McManis", title = "{Java} In Depth: Synchronizing threads in {Java}, {Part II}", journal = j-JAVAWORLD, volume = "1", number = "3", pages = "??--??", month = may, year = "1996", CODEN = "????", ISSN = "1091-8906", bibdate = "Thu Aug 13 08:48:26 MDT 1998", bibsource = "http://www.javaworld.com/javaworld/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.javaworld.com/javaworld/jw-05-1996/jw-05-mcmanis.htm", acknowledgement = ack-nhfb, } @Article{McManis:1996:JDT, author = "Chuck McManis", title = "{Java} In Depth: Threads and applets and visual controls", journal = j-JAVAWORLD, volume = "1", number = "5", pages = "??--??", month = jul, year = "1996", CODEN = "????", ISSN = "1091-8906", bibdate = "Thu Aug 13 08:48:26 MDT 1998", bibsource = "http://www.javaworld.com/javaworld/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.javaworld.com/javaworld/jw-07-1996/jw-07-mcmanis.htm", acknowledgement = ack-nhfb, } @Article{Mikschl:1996:MMS, author = "A. Mikschl and W. 
Damm", title = "{MSparc}: a Multithreaded {Sparc}", journal = j-LECT-NOTES-COMP-SCI, volume = "1124", pages = "461--??", year = "1996", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Tue Oct 29 14:12:39 MST 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @MastersThesis{Mishra:1996:TIS, author = "Amitabh Mishra", title = "Task and instruction scheduling in parallel multithreaded processors", type = "Thesis ({M.S.})", school = "Department of Computer Science, Texas A\&M University", address = "College Station, TX, USA", pages = "ix + 60", year = "1996", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Major computer science", } @Article{Mitchell:1996:JTM, author = "John D. Mitchell", title = "{Java} Tips: More about threads and the resize problem", journal = j-JAVAWORLD, volume = "1", number = "4", pages = "??--??", month = jun, year = "1996", CODEN = "????", ISSN = "1091-8906", bibdate = "Thu Aug 13 08:48:26 MDT 1998", bibsource = "http://www.javaworld.com/javaworld/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.javaworld.com/javaworld/javatips/jw-javatip9.htm", acknowledgement = ack-nhfb, } @Book{Moore:1996:MPD, author = "Simon W.
(Simon William) Moore", title = "Multithreaded processor design", volume = "SECS 358", publisher = pub-KLUWER, address = pub-KLUWER:adr, pages = "xvi + 142", year = "1996", ISBN = "0-7923-9718-5", ISBN-13 = "978-0-7923-9718-2", LCCN = "QA76.5 .M574 1996", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", series = "The Kluwer international series in engineering and computer science", acknowledgement = ack-nhfb, keywords = "Computer architecture; computer architecture; Computers -- Design; multiprocessors -- design and construction; Multiprocessors -- Design and construction; Parallel computers; parallel computers", } @Book{Nichols:1996:PP, author = "Bradford Nichols and Dick Buttlar and Jackie Proulx Farrell", title = "{Pthreads} Programming", publisher = pub-ORA, address = pub-ORA:adr, pages = "xvi + 267", year = "1996", ISBN = "1-56592-115-1", ISBN-13 = "978-1-56592-115-3", LCCN = "QA76.642.N53 1996", bibdate = "Mon May 11 11:04:53 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", price = "US\$29.95", URL = "http://www.amazon.com/exec/obidos/ASIN/1565921151/ref=sim_books/002-4892305-5599452; http://www.oreilly.com/catalog/pthread", acknowledgement = ack-nhfb, } @Book{Northrup:1996:PUT, author = "Charles J. Northrup", title = "Programming with {UNIX} Threads", publisher = pub-WILEY, address = pub-WILEY:adr, pages = "xv + 399", year = "1996", ISBN = "0-471-13751-0 (paperback)", ISBN-13 = "978-0-471-13751-1 (paperback)", LCCN = "QA76.76.O63 N674 1996", bibdate = "Tue May 25 07:14:38 MDT 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "operating systems (computers); UNIX (computer file)", } @Book{Norton:1996:TTM, author = "Scott J. Norton and Mark D. 
DiPasquale", title = "Thread time: the multithreaded programming guide", publisher = pub-PH, address = pub-PH:adr, pages = "xx + 538", year = "1996", ISBN = "0-13-190067-6 (paperback)", ISBN-13 = "978-0-13-190067-7 (paperback)", LCCN = "QA76.642.N67 1996", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", series = "Hewlett--Packard professional books", URL = "http://www.amazon.com/exec/obidos/ASIN/0131900676/ref=sim_books/002-4892305-5599452", acknowledgement = ack-nhfb, annote = "System requirements: IBM compatible PC; CD-ROM drive.", keywords = "Parallel programming (Computer science)", } @Book{Pham:1996:MPW, author = "Thuan Q. Pham and Pankaj K. Garg", title = "Multithreaded programming with {Windows NT}", publisher = pub-PHPTR, address = pub-PHPTR:adr, pages = "xviii + 227", year = "1996", ISBN = "0-13-120643-5", ISBN-13 = "978-0-13-120643-4", LCCN = "QA76.642 .P52 1996", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, annote = "One 3 1/2 in. diskette in pocket inside back cover.", keywords = "Microsoft Windows NT; multiprocessors; Multiprocessors; Parallel programming; parallel programming (computer science); Parallel programming (Computer science)", } @Article{Philbin:1996:TSC, author = "James Philbin and Jan Edler and Otto J. Anshus and Craig C. 
Douglas and Kai Li", title = "Thread Scheduling for Cache Locality", journal = j-SIGPLAN, volume = "31", number = "9", pages = "60--71", month = sep, year = "1996", CODEN = "SINODQ", ISBN = "0-89791-767-7", ISBN-13 = "978-0-89791-767-4", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:17:23 MST 2003", bibsource = "http://portal.acm.org/; http://www.acm.org/pubs/toc/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "Co-published as SIGOPS Operating Systems Review {\bf 30}(5), December 1996, and as SIGARCH Computer Architecture News, {\bf 24}(special issue), October 1996.", URL = "http://www.acm.org:80/pubs/citations/proceedings/asplos/237090/p60-philbin/", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "algorithms; experimentation; performance", subject = "{\bf D.3.4} Software, PROGRAMMING LANGUAGES, Processors, Optimization. {\bf I.1.2} Computing Methodologies, SYMBOLIC AND ALGEBRAIC MANIPULATION, Algorithms, Algebraic algorithms. {\bf F.2.2} Theory of Computation, ANALYSIS OF ALGORITHMS AND PROBLEM COMPLEXITY, Nonnumerical Algorithms and Problems, Sequencing and scheduling. {\bf F.2.1} Theory of Computation, ANALYSIS OF ALGORITHMS AND PROBLEM COMPLEXITY, Numerical Algorithms and Problems, Computations on matrices. {\bf D.2.2} Software, SOFTWARE ENGINEERING, Design Tools and Techniques, User interfaces.", } @Book{Robbins:1996:PUP, author = "Kay A. 
Robbins and Steven Robbins", title = "Practical {UNIX} programming: a guide to concurrency, communication, and multithreading", publisher = pub-PHPTR, address = pub-PHPTR:adr, pages = "xiv + 658", year = "1996", ISBN = "0-13-443706-3", ISBN-13 = "978-0-13-443706-4", LCCN = "QA76.76.O63 R615 1996", bibdate = "Tue May 25 07:14:38 MDT 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Microcomputers -- Programming; Operating systems; UNIX (Computer file)", } @Article{Roh:1996:GOE, author = "Lucas Roh and Walid A. Najjar and Bhanu Shankar and A. P. Wim B{\"o}hm", title = "Generation, Optimization, and Evaluation of Multithreaded Code", journal = j-J-PAR-DIST-COMP, volume = "32", number = "2", pages = "188--204", day = "1", month = feb, year = "1996", CODEN = "JPDCER", DOI = "https://doi.org/10.1006/jpdc.1996.0013", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Thu Mar 9 09:18:59 MST 2000", bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0013/production; http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0013/production/pdf", acknowledgement = ack-nhfb, classification = "C1180 (Optimisation techniques); C4230M (Multiprocessor interconnection); C5220P (Parallel architecture); C6110P (Parallel programming); C6150C (Compilers, interpreters and other processors); C6150N (Distributed systems software)", corpsource = "Dept. of Comput. 
Sci., Colorado State Univ., Fort Collins, CO, USA", fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", keywords = "architectures; code generation scheme; compiler intermediate; form; global bottom-up optimization technique; inputs; instruction level; intrathread locality; latency tolerance; multiprocessor interconnection networks; multithreaded; multithreaded code; multithreaded code evaluation; multithreaded code generation; multithreaded computation model; multithreaded synchronization; optimisation; optimising compilers; parallel; parallel architectures; parallelising compilers; parallelism; Pebbles; processor scheduling; processor utilization; program level; programming; reduced instruction set computing; scalability; synchronisation; synchronization costs; top-down code generation", treatment = "T Theoretical or Mathematical", } @Article{Ruddock:1996:MPG, author = "David E. Ruddock and Balakrishnan Dasarathy", title = "Multithreading Programs: Guidelines for {DCE} Applications", journal = j-IEEE-SOFTWARE, volume = "13", number = "1", pages = "80--90", month = jan, year = "1996", CODEN = "IESOEG", ISSN = "0740-7459 (print), 0740-7459 (electronic)", ISSN-L = "0740-7459", bibdate = "Sat Jan 25 07:35:26 MST 1997", bibsource = "Compendex database; https://www.math.utah.edu/pub/tex/bib/ieeesoft.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, affiliation = "Bellcore", affiliationaddress = "Piscataway, NJ, USA", classification = "722.2; 722.4; 723.1; 723.2; 723.3", fjournal = "IEEE Software", journal-URL = "http://www.computer.org/portal/web/csdl/magazines/software", journalabr = "IEEE Software", keywords = "Application programming interfaces; Client server architecture; Computer aided software engineering; Computer operating systems; Computer programming languages; Concurrency control; Data communication systems; Data structures; Distributed 
computer systems; Distributed computing environment; Multithreading; Network services; Remote procedure call; Security of data; Synchronization; Telecommunication services; User interfaces", } @InProceedings{Sah:1996:PIS, author = "A. Sah and K. Brown and E. Brewer", title = "Programming the {Internet} from the server-side with {Tcl} and {Audience1}", crossref = "USENIX:1996:ATT", pages = "235--??, 183--188", year = "1996", bibdate = "Sat Mar 15 08:49:09 MST 1997", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classcodes = "C6150N (Distributed systems software); C6115 (Programming support); C6110 (Systems analysis and programming); C6140D (High level languages); C7230 (Publishing and reproduction); C7250N (Front end systems for online searching)", conflocation = "Monterey, CA, USA; 10--13 July 1996", conftitle = "Proceedings of 4th Annual Tcl/Tk Workshop '96", corpsource = "Inktomi Corp., Berkeley, CA, USA", keywords = "applications; Audience1; authoring languages; client-server; client-server systems; client-side languages; electronic; end-; extension library; HotBot search engine; HotWired; Inktomi; Internet; mass customization features; MTtcl; multi-threaded Tcl; online front-ends; programming; publishing; server languages; server-side Internet programming; software libraries; to-end publishing tool; World Wide Web", treatment = "P Practical", } @Article{Schmidt:1996:CAPa, author = "D. C. Schmidt and S. 
Vinoski", title = "Comparing alternative programming techniques for multithreaded servers", journal = j-C-PLUS-PLUS-REPORT, volume = "8", number = "2", pages = "50--59", month = feb, year = "1996", CODEN = "CRPTE7", ISSN = "1040-6042", bibdate = "Tue Mar 25 13:34:48 MST 1997", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classcodes = "C6150N (Distributed systems software); C6110J (Object- oriented programming); C6160 (Database management systems (DBMS)); C6140D (High level languages)", corpsource = "Washington Univ., St. Louis, MO, USA", fjournal = "C++ Report", keywords = "applications; C; C language; C++; client-server systems; CORBA; database management; desktop client; financial data processing; investment brokers; languages; multithreaded servers; multithreaded systems; object-oriented; object-oriented programming; programming; query processing; stock prices; stock quote database; synchronization; systems; wrappers", treatment = "P Practical", } @Article{Schmidt:1996:CAPb, author = "D. C. Schmidt and S. Vinoski", title = "Comparing alternative programming techniques for multithreaded {CORBA} servers", journal = j-C-PLUS-PLUS-REPORT, volume = "8", number = "4", pages = "56--66", month = apr, year = "1996", CODEN = "CRPTE7", ISSN = "1040-6042", bibdate = "Tue Mar 25 13:34:48 MST 1997", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classcodes = "C6110J (Object-oriented programming); C6110P (Parallel programming); C6140D (High level languages)", corpsource = "Washington Univ., St. Louis, MO, USA", fjournal = "C++ Report", keywords = "C language; complexity; distributed multithreaded applications; multithreaded CORBA servers; object-oriented programming; parallel; programming; programming techniques", treatment = "P Practical", } @Article{Schmidt:1996:CAPc, author = "D. C. Schmidt and S. 
Vinoski", title = "Comparing alternative programming techniques for multithreaded {CORBA} servers", journal = j-C-PLUS-PLUS-REPORT, volume = "8", number = "7", pages = "47--56", month = jul, year = "1996", CODEN = "CRPTE7", ISSN = "1040-6042", bibdate = "Tue Mar 25 13:34:48 MST 1997", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classcodes = "C6110J (Object-oriented programming); C6150N (Distributed systems software); C5690 (Other data communication equipment and techniques); C6110P (Parallel programming)", corpsource = "Washington Univ., St. Louis, MO, USA", fjournal = "C++ Report", keywords = "alternative programming techniques; C; C++ wrappers; concurrency model; CORBA; multithreaded CORBA; multithreaded stock quote servers; network servers; object-oriented programming; parallel; programming; servers; thread per request; thread per session model; thread pool", treatment = "P Practical", } @Article{Severance:1996:MOB, author = "Charles Severance and Richard Enbody and Paul Petersen", title = "Managing the Overall Balance of Operating System Threads on a Multiprocessor Using Automatic Self-Allocating Threads ({ASAT})", journal = j-J-PAR-DIST-COMP, volume = "37", number = "1", pages = "106--112", day = "25", month = aug, year = "1996", CODEN = "JPDCER", DOI = "https://doi.org/10.1006/jpdc.1996.0111", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Thu Mar 9 09:19:00 MST 2000", bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0111/production; http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0111/production/pdf", acknowledgement = ack-nhfb, classification = "C5440 (Multiprocessing systems); C6110P (Parallel programming); C6150J (Operating systems); C6150N (Distributed systems software)", corpsource = "Dept. 
of Comput. Sci., Michigan State Univ., East Lansing, MI, USA", fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", keywords = "allocating threads; allocation; automatic self-; multiprocessing system; multiprocessing systems; operating system; operating systems (computers); parallel programming; processor scheduling; run-time environment; self-scheduling; thread; thread scheduling", treatment = "P Practical; X Experimental", } @Article{Sigmund:1996:IBM, author = "U. Sigmund and T. Ungerer", title = "Identifying Bottlenecks in a Multithreaded Superscalar Microprocessor", journal = j-LECT-NOTES-COMP-SCI, volume = "1124", pages = "797--??", year = "1996", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Tue Oct 29 14:12:39 MST 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @InProceedings{Skjellum:1996:TTM, author = "A. Skjellum and B. Protopopov and S. Hebert", title = "A thread taxonomy for {MPI}", crossref = "IEEE:1996:PSM", pages = "50--57", year = "1996", bibdate = "Sat Apr 19 16:34:54 MDT 1997", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classification = "C6110B (Software engineering techniques); C6110F (Formal methods); C6150E (General utility programs); C6150J (Operating systems); C6150N (Distributed systems software)", conftitle = "Proceedings. Second MPI Developer's Conference", corpsource = "Dept. of Comput. 
Sci., Mississippi State Univ., MS, USA", keywords = "API extensions; application program interfaces; Channel Device; computational unit; fine-grain concurrency; formal specification; message passing; minimal portable thread management; MPI; MPICH; multi-threaded thread-safe ADI; non-thread-safe MPI call semantics; resource container; software portability; synchronisation; synchronization mechanisms; thread models; thread safety; thread taxonomy; user-level mechanism; utility programs; Windows NT version", sponsororg = "IEEE Comput. Soc. Tech. Committee on Distributed Process", treatment = "P Practical", } @Article{Sundaresan:1996:COO, author = "Neelakantan Sundaresan and Dennis Gannon", title = "{Coir}: An Object-Oriented System for Control and Dynamic Data Parallelism", journal = j-J-PAR-DIST-COMP, volume = "37", number = "1", pages = "98--105", day = "25", month = aug, year = "1996", CODEN = "JPDCER", DOI = "https://doi.org/10.1006/jpdc.1996.0110", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Thu Mar 9 09:19:00 MST 2000", bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0110/production; http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0110/production/pdf", acknowledgement = ack-nhfb, classification = "C4240P (Parallel programming and algorithm theory); C5220P (Parallel architecture); C6110J (Object-oriented programming); C6110P (Parallel programming); C6150N (Distributed systems software)", corpsource = "Applic. Dev. Technol. 
Inst., IBM Corp., San Jose, CA, USA", fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", keywords = "C++ library; Coir; distributed memory machines; distributed memory systems; dynamic data parallelism; message passing; message-passing; multithreading; object-oriented; object-oriented system; operating system; parallel; parallel architectures; parallel programming; programming; shared memory systems; symmetric multiprocessors; synchronisation", treatment = "P Practical; T Theoretical or Mathematical", } @Article{Tullsen:1996:ECI, author = "Dean M. Tullsen and Susan J. Eggers and Joel S. Emer and Henry M. Levy and Jack L. Lo and Rebecca L. Stamm", title = "Exploiting choice: instruction fetch and issue on an implementable simultaneous multithreading processor", journal = j-COMP-ARCH-NEWS, volume = "24", number = "2", pages = "191--202", month = may, year = "1996", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:47 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @PhdThesis{Tullsen:1996:SM, author = "Dean Michael Tullsen", title = "Simultaneous multithreading", type = "Thesis ({Ph.D.})", school = "University of Washington", address = "Seattle, WA, USA", pages = "vi + 99", year = "1996", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Computer architecture; Parallel processing (Electronic computers)", } @MastersThesis{Verriello:1996:MSM, author = "Anthony Verriello", title = "Memory sharing in multithreaded transaction environments", type = "Thesis ({M.S.})", school = "Hofstra University", address = "Westport, CT, USA", pages = "180", year = 
"1996", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Memory, Virtual (Computer science); Transaction systems (Computer systems)", } @Article{Vinoski:1996:DCD, author = "S. Vinoski and D. C. Schmidt", title = "Distributed callbacks and decoupled communication in {CORBA}", journal = j-C-PLUS-PLUS-REPORT, volume = "8", number = "9", pages = "48--56, 77", month = oct, year = "1996", CODEN = "CRPTE7", ISSN = "1040-6042", bibdate = "Tue Mar 25 13:34:48 MST 1997", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classcodes = "C6150N (Distributed systems software); C6110J (Object- oriented programming)", corpsource = "Hewlett--Packard's Distributed Comput. Program, Chelmsford, MA, USA", fjournal = "C++ Report", keywords = "client-server systems; client/server; concurrency control; concurrency models; consumers; CORBA; decoupled communication; decoupled peer-to-peer; distributed callbacks; distributed object computing systems; distributed stock quoting; multithreaded; object-oriented; OMG Events object service; programming; relationships; request communication; response communication; server applications; suppliers; systems", treatment = "P Practical", } @Article{Vlassov:1996:AMM, author = "V. Vlassov and L.-E. Thorelli", title = "Analytical Models of Multithreading with Data Prefetching", journal = j-LECT-NOTES-COMP-SCI, volume = "1124", pages = "714--??", year = "1996", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Tue Oct 29 14:12:39 MST 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Wise:1996:SDP, author = "David S. 
Wise and Joshua Walgenbach", title = "Static and dynamic partitioning of pointers as links and threads", journal = j-SIGPLAN, volume = "31", number = "6", pages = "42--49", month = jun, year = "1996", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:17:20 MST 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, affiliation = "Dept. of Comput. Sci., Indiana Univ., Bloomington, IN, USA", fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Wismuller:1996:IDP, author = "Roland Wism{\"u}ller and Michael Oberhuber and Johann Krammer and Olav Hansen", title = "Interactive debugging and performance analysis of massively parallel applications", journal = j-PARALLEL-COMPUTING, volume = "22", number = "3", pages = "415--442", day = "29", month = apr, year = "1996", CODEN = "PACOEJ", ISSN = "0167-8191 (print), 1872-7336 (electronic)", ISSN-L = "0167-8191", bibdate = "Fri Aug 6 10:14:54 MDT 1999", bibsource = "Compendex database; http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_free/browse/browse.cgi?year=1996&volume=22&issue=3; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_sub/browse/browse.cgi?year=1996&volume=22&issue=3&aid=1049", acknowledgement = ack-nhfb, affiliation = "Inst f{\"u}r Informatik der Technischen Universit{\"a}t M{\"u}nchen", affiliationaddress = "M{\"u}nchen, Ger", classification = "722.2; 722.4; 723.1; 723.2; 723.5; C6110P (Parallel programming); C6115 (Programming support); C6150G (Diagnostic, testing, debugging and evaluating systems)", corpsource = "Inst. f{\"u}r Inf., Tech. Univ. 
M{\"u}nchen, Germany", fjournal = "Parallel Computing", journal-URL = "http://www.sciencedirect.com/science/journal/01678191", journalabr = "Parallel Comput", keywords = "applications; attributed measurements; Codes (symbols); Computer debugging; Computer programming; Computer simulation; debugger; debugging; DETOP; Distributed computer systems; distributed evaluation; Distributed online monitoring system; environments; Interactive computer systems; Interactive debugging; intrusion; massively parallel; Massively parallel applications; minimal; monitoring system; multithreaded programming models; Online systems; parallel; Parallel debugger; Parallel processing systems; parallel programming; Parallelization; PATOP; Performance; performance analysis; Performance analysis; performance analyzer; performance bottlenecks; Personal computers; PowerPC; program debugging; programming; scalability; software; software performance evaluation; Supercomputers; tools; usability; User interfaces", treatment = "P Practical", } @Article{Yam:1996:DPV, author = "Michael Yam", title = "{DCE} Pthreads versus {NT} Threads. {Michael} ports {PTF}, a {C++} class library for {DCE} pthreads, from {HP-UX System 9} to {Windows NT}. {In} doing so, he examines the differences between pthreads and {NT} threads, and describes the porting experience", journal = j-DDJ, volume = "21", number = "12", pages = "16--??", month = dec, year = "1996", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Mon Dec 2 07:52:21 MST 1996", bibsource = "http://www.ddj.com/index/author/index.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Dr. Dobb's Journal of Software Tools", } @Article{Yoo:1996:CAA, author = "H. 
Chuck Yoo", title = "Comparative Analysis of Asynchronous {I/O} in Multithreaded {UNIX}", journal = j-SPE, volume = "26", number = "9", pages = "987--997", month = sep, year = "1996", CODEN = "SPEXBL", ISSN = "0038-0644 (print), 1097-024X (electronic)", ISSN-L = "0038-0644", bibdate = "Thu Jul 29 15:11:03 MDT 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/spe.bib; http://www3.interscience.wiley.com/journalfinder.html", URL = "http://www3.interscience.wiley.com/cgi-bin/abstract?ID=16832", acknowledgement = ack-nhfb, fjournal = "Software --- Practice and Experience", journal-URL = "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1097-024X", } @PhdThesis{Yoo:1996:PCM, author = "Namhoon Yoo", title = "Parallelism control in multithreaded multiprocessors", type = "Thesis ({Ph.D.})", school = "University of Southern California", address = "Los Angeles, CA, USA", pages = "x + 86", year = "1996", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Computer architecture; Data flow computing; Multiprocessors; Parallel processing (Electronic computers)", } @Book{Zignin:1996:TDM, author = "Bernard Zignin", title = "Techniques du multithread: du parall{\'e}lisme dans les processus {(French) [Multithreading techniques: parallelism in processes]}", publisher = pub-HERMES, address = pub-HERMES:adr, pages = "72", year = "1996", ISBN = "2-86601-562-2", ISBN-13 = "978-2-86601-562-6", LCCN = "????", bibdate = "Wed Dec 09 23:36:26 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", series = "CNAM. 
Syntheses informatiques", acknowledgement = ack-nhfb, keywords = "Parall{\`e}lisme (informatique)", language = "French", } @Article{Anonymous:1997:NPW, author = "Anonymous", title = "New Products: {WebThreads 1.0.1; QUERYFLEX Report Writer; Linux Pro Desktop 1.0; NDP Fortran for Linux; Numerics and Visualization for Java; Craftworks Linux/AXP 2.2; InfoDock Linux Software Development Toolset; Caldera Wabi 2.2 for Linux}", journal = j-LINUX-J, volume = "34", pages = "??--??", month = feb, year = "1997", CODEN = "LIJOFX", ISSN = "1075-3583 (print), 1938-3827 (electronic)", ISSN-L = "1075-3583", bibdate = "Fri Oct 9 08:35:26 MDT 1998", bibsource = "http://noframes.linuxjournal.com/lj-issues/issue34/index.html; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Linux journal", journal-URL = "http://portal.acm.org/citation.cfm?id=J508", } @Article{Anonymous:1997:TNR, author = "Anonymous", title = "Technology News \& Reviews: {Chemkin} software; {OpenMP Fortran Standard}; {ODE} Toolbox for {Matlab}; {Java} products; {Scientific WorkPlace 3.0}", journal = j-IEEE-COMPUT-SCI-ENG, volume = "4", number = "4", pages = "75--??", month = oct # "\slash " # dec, year = "1997", CODEN = "ISCEE4", ISSN = "1070-9924 (print), 1558-190X (electronic)", ISSN-L = "1070-9924", bibdate = "Sat Jan 9 08:57:23 MST 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/java.bib; https://www.math.utah.edu/pub/tex/bib/matlab.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://dlib.computer.org/cs/books/cs1997/pdf/c4075.pdf", acknowledgement = ack-nhfb, fjournal = "IEEE Computational Science \& Engineering", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=99", } @Article{Anonymous:1997:TWP, author = "Anonymous", title = "Tech Watch --- Pattern-recognition system. {Piecing} together history. {3D} semiconductor simulation. 
{Multi}-threaded architecture", journal = j-CG-WORLD, volume = "20", number = "9", pages = "15--??", month = sep, year = "1997", CODEN = "CGWODH", ISSN = "0271-4159", bibdate = "Sat Nov 7 10:32:27 MST 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Computer Graphics World", } @Article{Arvind:1997:MSC, author = "Arvind and A. Caro and J.-W. Maessen and S. Aditya", title = "A Multithreaded Substrate and Compilation Model for the Implicitly Parallel Language {pH}", journal = j-LECT-NOTES-COMP-SCI, volume = "1239", pages = "519--??", year = "1997", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Fri Aug 22 11:59:49 MDT 1997", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Bednorz:1997:CDA, author = "M. Bednorz and A. Gwozdowski and K. Zieli{\'n}ski", title = "Contextual debugging and analysis of multithreaded applications", journal = j-CPE, volume = "9", number = "2", pages = "123--139", month = feb, year = "1997", CODEN = "CPEXEI", ISSN = "1040-3108", ISSN-L = "1040-3108", bibdate = "Tue Sep 7 06:06:28 MDT 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www3.interscience.wiley.com/journalfinder.html", URL = "http://www3.interscience.wiley.com/cgi-bin/abstract?ID=13852; http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=13852&PLACEBO=IE.pdf", acknowledgement = ack-nhfb, fjournal = "Concurrency, practice and experience", } @Book{Beveridge:1997:MAW, author = "Jim Beveridge and Robert Wiener", title = "Multithreading applications in {Win32}: the complete guide to threads", publisher = pub-AWDP, address = pub-AWDP:adr, pages = "xviii + 368", year = "1997", ISBN = "0-201-44234-5 (pb) 0-201-18385-4 (CD-ROM)", ISBN-13 = "978-0-201-44234-2 (pb) 978-0-201-18385-6 (CD-ROM)", LCCN = "QA76.76.O63 B478 
1997", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, annote = "System requirements: IBM compatible PC; Win32; Windows NT or Windows 95; CD-ROM drive.", keywords = "Microsoft Win32; Microsoft Windows (Computer file); Microsoft Windows NT; Operating systems (Computers)", } @Article{Bik:1997:JPJ, author = "Aart J. C. Bik and Juan E. Villacis and Dennis B. Gannon", title = "javar: a prototype {Java} restructuring compiler", journal = j-CPE, volume = "9", number = "11", pages = "1181--1191", month = nov, year = "1997", CODEN = "CPEXEI", ISSN = "1040-3108", ISSN-L = "1040-3108", bibdate = "Tue Sep 7 06:06:35 MDT 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www3.interscience.wiley.com/journalfinder.html", note = "Special Issue: Java for computational science and engineering --- simulation and modeling II.", URL = "http://www3.interscience.wiley.com/cgi-bin/abstract?ID=13819; http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=13819&PLACEBO=IE.pdf", acknowledgement = ack-nhfb, classification = "C6110J (Object-oriented programming); C6110P (Parallel programming); C6150C (Compilers, interpreters and other processors)", conflocation = "Las Vegas, NV, USA; 21 June 1997", conftitle = "Java for Computational Science and Engineering --- Simulation and Modeling II", corpsource = "Dept. of Comput. 
Sci., Indiana Univ., Bloomington, IN, USA", fjournal = "Concurrency, practice and experience", keywords = "annotations; explicit parallelism; functionality; implicit parallelism; Java program parallelization; Java restructuring compiler; javar; multi-threading; object-oriented languages; parallelising compilers; prototype; semantic analysis; software prototyping", pubcountry = "UK", sponsororg = "ACM", treatment = "P Practical", } @Article{Bordawekar:1997:EEH, author = "Rajesh Bordawekar and Steven Landherr and Don Capps and Mark Davis", title = "Experimental evaluation of the {Hewlett--Packard} {Exemplar} file system", journal = j-SIGMETRICS, volume = "25", number = "3", pages = "21--28", month = dec, year = "1997", CODEN = "????", DOI = "https://doi.org/10.1145/270900.270904", ISSN = "0163-5999 (print), 1557-9484 (electronic)", ISSN-L = "0163-5999", bibdate = "Thu Jun 26 11:24:50 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "This article presents results from an experimental evaluation study of the HP Exemplar file system. Our experiments consist of simple micro-benchmarks that study the impact of various factors on the file system performance. These factors include I/O request/buffer sizes, vectored/non-vectored access patterns, read-ahead policies, multi-threaded (temporally irregular) requests, and architectural issues (cache parameters, NUMA behavior, etc.). Experimental results indicate that the Exemplar file system provides high I/O bandwidth, both for single- and multi-threaded applications. The buffer cache, with prioritized buffer management and large buffer sizes, is effective in exploiting temporal and spatial access localities. The performance of non-contiguous accesses can be improved by either using vectored I/O interfaces or tuning the read-ahead facilities. 
The file system performance depends on the relative locations of the computing threads and the file system, and also on various Exemplar design parameters such as the NUMA architecture, TLB/data cache management and paging policies.", acknowledgement = ack-nhfb, fjournal = "ACM SIGMETRICS Performance Evaluation Review", journal-URL = "http://portal.acm.org/toc.cfm?id=J618", } @Article{Bramley:1997:TNRb, author = "Randall Bramley", title = "Technology News \& Reviews: {Chemkin} software; {OpenMP Fortran Standard}; {ODE} Toolbox for {Matlab}; {Java} products; {Scientific WorkPlace 3.0}", journal = j-IEEE-COMPUT-SCI-ENG, volume = "4", number = "4", pages = "75--78", month = oct # "\slash " # dec, year = "1997", CODEN = "ISCEE4", ISSN = "1070-9924 (print), 1558-190X (electronic)", ISSN-L = "1070-9924", bibdate = "Sat Jan 9 08:57:23 MST 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputscieng.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://dlib.computer.org/cs/books/cs1997/pdf/c4075.pdf", acknowledgement = ack-nhfb, fjournal = "IEEE Computational Science \& Engineering", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=99", remark = "No DOI available: article missing from IEEE Xplore database.", } @Book{Butenhof:1997:PPT, author = "David R. 
Butenhof", title = "Programming with {POSIX} threads", publisher = pub-AW, address = pub-AW:adr, pages = "xviii + 381", year = "1997", ISBN = "0-201-63392-2", ISBN-13 = "978-0-201-63392-4", LCCN = "QA76.76.T55B88 1997", bibdate = "Mon Sep 01 08:53:12 1997", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", price = "US\$31.95", URL = "http://www.amazon.com/exec/obidos/ASIN/0201633922/ref=sim_books/002-4892305-5599452", acknowledgement = ack-nhfb, } @Article{Calcote:1997:TPS, author = "John Calcote", title = "Thread Pools and Server Performance", journal = j-DDJ, volume = "22", number = "7", pages = "60--??", month = jul, year = "1997", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Sat Jun 28 10:43:47 MDT 1997", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Dr. Dobb's Journal of Software Tools", } @Article{Cenciarelli:1997:SMJ, author = "P. Cenciarelli and A. Knapp and B. Reus and M. Wirsing", title = "From sequential to multi-threaded {Java}: An event-based operational semantics", journal = j-LECT-NOTES-COMP-SCI, volume = "1349", pages = "75--??", year = "1997", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Tue Apr 28 08:51:33 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/java.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Cenciarelli:1997:SMT, author = "P. Cenciarelli and A. Knapp and B. Reus and M. 
Wirsing", title = "From sequential to multi-threaded {Java}: An event-based operational semantics", journal = j-LECT-NOTES-COMP-SCI, volume = "1349", pages = "75--??", year = "1997", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Tue Apr 28 08:51:33 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @InProceedings{Dou:1997:ISV, author = "Yong Dou and Zhengbing Pang and Xingming Zhou", title = "Implementing a software virtual shared memory on {PVM}", crossref = "IEEE:1997:APD", pages = "??--??", year = "1997", bibdate = "Wed Apr 16 06:39:19 MDT 1997", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classification = "C6110P (Parallel programming); C6115 (Programming support); C6120 (File organisation); C6140D (High level languages); C7430 (Computer engineering)", corpsource = "Dept. of Comput. Sci., Changsha Inst. of Technol., Hunan, China", keywords = "distributed; FORTRAN; FORTRAN language; GKD-VSM; memory environments; multithread scheme; parallel programming; parallel programming model; Prefetch and Poststore; programming environments; PVM; shared memory; software overhead; software virtual shared memory; synchronisation; user-level; virtual machines; virtual storage", treatment = "P Practical", } @Article{Eggers:1997:SMP, author = "Susan J. Eggers and Joel S. Emer and Henry M. Levy and Jack L. Lo and Rebecca L. Stamm and Dean M. 
Tullsen", title = "Simultaneous Multithreading: a Platform for Next-Generation Processors", journal = j-IEEE-MICRO, volume = "17", number = "5", pages = "12--19", month = sep # "\slash " # oct, year = "1997", CODEN = "IEMIDZ", DOI = "https://doi.org/10.1109/40.621209", ISSN = "0272-1732 (print), 1937-4143 (electronic)", ISSN-L = "0272-1732", bibdate = "Thu Dec 14 06:08:58 MST 2000", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeemicro.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; Science Citation Index database (1980--2000)", URL = "http://dlib.computer.org/mi/books/mi1997/pdf/m5012.pdf; http://www.computer.org/micro/mi1997/m5012abs.htm", acknowledgement = ack-nhfb, fjournal = "IEEE Micro", journal-URL = "http://www.computer.org/csdl/mags/mi/index.html", } @TechReport{Eickemeyer:1997:EMP, author = "Richard J. Eickemeyer", title = "Evaluation of multithreaded processors and thread-switch policies", type = "Research report", number = "RC 20956 (92759)", institution = "IBM T. J. Watson Research Center", address = "Yorktown Heights, NY, USA", pages = "16", day = "18", month = aug, year = "1997", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "This paper examines the use of coarse-grained multithreading to lessen the negative impact of memory access latencies on the performance of uniprocessor on-line transaction processing systems. It considers the effect of switching threads on cache misses in a two-level cache system. It also examines several different thread-switch policies. The results suggest that multithreading with a small number (3-5) of active threads can significantly improve the performance of such commercial environments.", acknowledgement = ack-nhfb, keywords = "Cache memory; Computer architecture; Threads (Computer programs)", } @Article{Emerson:1997:USW, author = "E. A. Emerson and A. P. 
Sistla", title = "Utilizing Symmetry when Model-Checking under Fairness Assumptions: An Automata-Theoretic Approach", journal = j-TOPLAS, volume = "19", number = "4", pages = "617--638", month = jul, year = "1997", CODEN = "ATPSDT", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Wed Dec 3 16:28:05 MST 1997", bibsource = "http://www.acm.org/pubs/toc/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/journals/toplas/1997-19-4/p617-emerson/", abstract = "One useful technique for combating the state explosion problem is to exploit symmetry when performing temporal logic model checking. In previous work it is shown how, using some basic notions of group theory, symmetry may be exploited for the full range of correctness properties expressible in the very expressive temporal logic CTL*. Surprisingly, while fairness properties are readily expressible in CTL*, these methods are not powerful enough to admit any amelioration of state explosion, when fairness assumptions are involved. We show that it is nonetheless possible to handle fairness efficiently by trading some group theory for automata theory. Our automata-theoretic approach depends on detecting fair paths subtly encoded in a quotient structure whose arcs are annotated with permutations, by using a threaded structure that reflects coordinate shifts caused by the permutations.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Programming Languages and Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783", keywords = "design; languages; theory; verification", subject = "{\bf F.3.1} Theory of Computation, LOGICS AND MEANINGS OF PROGRAMS, Specifying and Verifying and Reasoning about Programs. {\bf F.1.1} Theory of Computation, COMPUTATION BY ABSTRACT DEVICES, Models of Computation. 
{\bf D.2.4} Software, SOFTWARE ENGINEERING, Software/Program Verification.", } @Article{Fillo:1997:MMM, author = "Marco Fillo and Stephen W. Keckler and William J. Dally and Nicholas P. Carter and Andrew Chang and Yevgeny Gurevich and Whay S. Lee", title = "The {M}-Machine Multicomputer", journal = j-INT-J-PARALLEL-PROG, volume = "25", number = "3", pages = "183--212", month = jun, year = "1997", CODEN = "IJPPE5", ISSN = "0885-7458 (print), 1573-7640 (electronic)", ISSN-L = "0885-7458", bibdate = "Tue Apr 7 18:25:25 MDT 1998", bibsource = "Compendex database; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, affiliation = "Massachusetts Inst of Technology", affiliationaddress = "Cambridge, MA, USA", classification = "714.2; 722; 722.1; 722.4; 723; 723.1", fjournal = "International Journal of Parallel Programming", journal-URL = "http://link.springer.com/journal/10766", journalabr = "Int J Parallel Program", keywords = "Buffer storage; Computer architecture; Data storage equipment; Microprocessor chips; Multiprogramming; Multithread processors; On chip cache; Parallel processing systems; Synchronization; Thread level parallelism; User interfaces", } @MastersThesis{Fisher:1997:SPS, author = "Michael T. 
Fisher", title = "A study of the performance of simultaneous multithreading on a superscalar processor", type = "Thesis ({M.S.E.E.})", number = "2363", school = "State University of New York at Binghamton, Watson School of Engineering and Applied Science", address = "Binghamton, NY, USA", pages = "vi + 98", year = "1997", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", series = "Master's theses / State University of New York at Binghamton", acknowledgement = ack-nhfb, alttitle = "Simultaneous multithreading on a superscalar processor Multithreading on a superscalar processor Superscalar processor", keywords = "Microprocessors -- Testing", } @MastersThesis{Fong:1997:BPM, author = "Waipang Fong", title = "Building a preprocessor for a multithreading compiler", type = "Thesis ({M.E.E.})", school = "Department of Electrical Engineering, University of Alabama", address = "Tuscaloosa, AL, USA", pages = "ix + 80", year = "1997", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Multiprocessors; Parallel processing (Electronic computers)", } @Article{Forsell:1997:MMV, author = "M. 
Forsell", title = "{MTAC} --- a Multithreaded {VLIW} Architecture for {PRAM} Simulation", journal = j-J-UCS, volume = "3", number = "9", pages = "1037--1055", day = "28", month = sep, year = "1997", CODEN = "????", ISSN = "0948-695X (print), 0948-6968 (electronic)", ISSN-L = "0948-6968", bibdate = "Wed Mar 4 15:32:49 MST 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://medoc.springer.de:8000/jucs/jucs_3_9/mtac_a_multithreaded_vliw", acknowledgement = ack-nhfb, fjournal = "J.UCS: Journal of Universal Computer Science", journal-URL = "http://www.jucs.org/jucs", } @Article{Foster:1997:MMC, author = "Ian Foster and Jonathan Geisler and Carl Kesselman and Steven Tuecke", title = "Managing Multiple Communication Methods in High-Performance Networked Computing Systems", journal = j-J-PAR-DIST-COMP, volume = "40", number = "1", pages = "35--48", day = "10", month = jan, year = "1997", CODEN = "JPDCER", DOI = "https://doi.org/10.1006/jpdc.1996.1266", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Thu Mar 9 09:19:01 MST 2000", bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1266/production; http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1266/production/pdf; http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1266/production/ref", acknowledgement = ack-nhfb, classification = "B6150M (Protocols); B6210L (Computer communications); C5440 (Multiprocessing systems); C5470 (Performance evaluation and testing); C5640 (Protocols); C5670 (Network performance)", corpsource = "Div. of Math. and Comput. Sci., Argonne Nat. 
Lab., IL, USA", fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", keywords = "Argonne MPICH library; computer networks; computing systems; criteria; heterogeneous networked environment; high-performance networked; message passing; message passing interface; multimethod communication; multiple communication methods; multithreaded runtime system; networked computing environments; Nexus; Nexus-based MPI implementation; performance characteristics; performance evaluation; protocols; remote service request mechanisms; transport mechanisms; user-specified selection", treatment = "P Practical", } @TechReport{Fujita:1997:MPA, author = "Tetsuya Theodore Fujita", title = "A multithreaded processor architecture for parallel symbolic computation", type = "Technical Report", number = "MIT/LCS/TM-338", institution = "Laboratory for Computer Science, Massachusetts Institute of Technology", address = "Cambridge, MA, USA", pages = "71", month = sep, year = "1997", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Computer architecture; Multilisp (Computer program language); Parallel processing (Electronic computers)", } @PhdThesis{Goldstein:1997:LTC, author = "Seth Copen Goldstein", title = "Lazy threads: compiler and runtime structures for fine-grained parallel programming", type = "Thesis ({Ph.D.})", number = "UCB/CSD-97-975", school = "Computer Science Division, University of California, Berkeley", address = "Berkeley, CA, USA", pages = "xi + 174", year = "1997", LCCN = "TK7885.A1 R46 no.97:975", bibdate = "Fri May 10 12:18:17 MDT 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", series = "Report", acknowledgement = ack-nhfb, } @Article{Gomez:1997:EMU, author = "Juan Carlos Gomez and Vernon Rego and V. S. 
Sunderam", title = "Efficient Multithreaded User-Space Transport for Network Computing: Design and Test of the {TRAP} Protocol", journal = j-J-PAR-DIST-COMP, volume = "40", number = "1", pages = "103--117", day = "10", month = jan, year = "1997", CODEN = "JPDCER", DOI = "https://doi.org/10.1006/jpdc.1996.1269", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Thu Mar 9 09:19:01 MST 2000", bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1269/production; http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1269/production/pdf; http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1269/production/ref", acknowledgement = ack-nhfb, classification = "B6150M (Protocols); B6210L (Computer communications); C5620 (Computer networks and techniques); C5640 (Protocols); C6150G (Diagnostic, testing, debugging and evaluating systems); C6150N (Distributed systems software)", corpsource = "Dept. of Comput. Sci., Purdue Univ., West Lafayette, IN, USA", fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", keywords = "communicating; communication; computer networks; computing; computing nodes; efficient multithreaded user-space transport; high-; low-latency; message passing; multithreaded message-passing libraries; network; nodes; performance distributed computing applications; processing; runtime performance; scalability characteristics; software libraries; software performance evaluation; testing; transaction; transaction-oriented protocol; transport protocols; TRAP protocol design; TRAP protocol testing; TRAP-based communication library; user-space protocol", treatment = "P Practical", } @Article{Goossens:1997:MVC, author = "B. 
Goossens", title = "A Multithreaded Vector Co-processor", journal = j-LECT-NOTES-COMP-SCI, volume = "1277", pages = "311--??", year = "1997", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Tue Apr 28 08:51:33 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Gorton:1997:GEI, author = "Ian Gorton and Innes E. Jelly", title = "{Guest Editors} Introduction: Software Engineering for Parallel and Distributed Systems: Challenges and Opportunities", journal = j-IEEE-CONCURR, volume = "5", number = "3", pages = "12--15", month = jul # "\slash " # sep, year = "1997", CODEN = "IECMFX", ISSN = "1092-3063 (print), 1558-0849 (electronic)", ISSN-L = "1092-3063", bibdate = "Tue Jan 16 06:04:48 MST 2001", bibsource = "Compendex database; https://www.math.utah.edu/pub/tex/bib/ieeeconcurrency.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://dlib.computer.org/pd/books/pd1997/pdf/p3012.pdf", acknowledgement = ack-nhfb, affiliation = "Commonwealth Science and Industrial Research Organization", affiliationaddress = "Aust", classification = "722; 722.4; 723; 723.1; 723.3", fjournal = "IEEE Concurrency", journalabr = "IEEE Concurrency", keywords = "Computer workstations; Concurrency control; Fault tolerant computer systems; High performance computing; Multithreaded servers; Parallel processing systems; Program debugging; Program diagnostics; Software engineering; World wide web", } @Article{Gunther:1997:MDF, author = "B. K. 
Gunther", title = "Multithreading with distributed functional units", journal = j-IEEE-TRANS-COMPUT, volume = "46", number = "4", pages = "399--411", month = apr, year = "1997", CODEN = "ITCOB4", DOI = "https://doi.org/10.1109/12.588034", ISSN = "0018-9340 (print), 1557-9956 (electronic)", ISSN-L = "0018-9340", bibdate = "Wed Jul 6 10:06:22 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput1990.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=588034", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Computers", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12", } @Manual{Haines:1997:DLT, author = "Matthew Haines", title = "On designing lightweight threads for substrate software", number = "201645", publisher = pub-NTIS, address = pub-NTIS:adr, pages = "??", year = "1997", LCCN = "DOC NAS 1.26:201645 mf11", bibdate = "Fri May 10 12:18:17 MDT 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "Shipping list number 98-0847-M.", series = "NASA contractor report", acknowledgement = ack-nhfb, keywords = "operating systems (computers); parallel computers; parallel processing (computers); threads", } @Article{Haines:1997:DPP, author = "Matthew Haines and Piyush Mehrotra and David Cronk", title = "Data-parallel programming in a multithreaded environment", journal = j-SCI-PROG, volume = "6", number = "2", pages = "187--200", month = "Summer", year = "1997", CODEN = "SCIPEV", ISSN = "1058-9244 (print), 1875-919X (electronic)", ISSN-L = "1058-9244", bibdate = "Thu Mar 28 12:27:27 MST 2002", bibsource = "Compendex database; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Scientific Programming", journal-URL = "http://iospress.metapress.com/content/1058-9244", } @Article{Haines:1997:OIA, author = "Matthew Haines", title = "An Open Implementation Analysis 
and Design for Lightweight Threads", journal = j-SIGPLAN, volume = "32", number = "10", pages = "229--242", month = oct, year = "1997", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:17:39 MST 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Book{Hanson:1997:CII, author = "David R. Hanson", title = "{C} Interfaces and Implementations: Techniques for Creating Reusable Software", publisher = pub-AW, address = pub-AW:adr, pages = "xvii + 519", year = "1997", ISBN = "0-201-49841-3", ISBN-13 = "978-0-201-49841-7", LCCN = "QA76.73.C15H37 1997", bibdate = "Fri Feb 27 16:08:11 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", price = "US\$37.95", series = "Addison-Wesley Professional Computing Series", URL = "http://www.cs.princeton.edu/software/cii/", acknowledgement = ack-nhfb, annote = "Multithreading is discussed in Chapter 20.", } @Article{Hendren:1997:CCE, author = "Laurie J. Hendren and Xinan Tang and Yingchun Zhu and Shereen Ghobrial and Guang R. 
Gao and Xun Xue and Haiying Cai and Pierre Ouellet", title = "Compiling {C} for the {EARTH} Multithreaded Architecture", journal = j-INT-J-PARALLEL-PROG, volume = "25", number = "4", pages = "305--338", month = aug, year = "1997", CODEN = "IJPPE5", ISSN = "0885-7458 (print), 1573-7640 (electronic)", ISSN-L = "0885-7458", bibdate = "Tue Apr 7 18:25:25 MDT 1998", bibsource = "Compendex database; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, affiliation = "McGill Univ", affiliationaddress = "Montreal, Que, Can", classification = "722; 722.4; 723; 723.1; 723.1.1; 723.2", fjournal = "International Journal of Parallel Programming", journal-URL = "http://link.springer.com/journal/10766", journalabr = "Int J Parallel Program", keywords = "C (programming language); Codes (symbols); Computer architecture; earth C programming language; Multithreaded architecture; Parallel processing systems; Program compilers; Program translators", } @Article{Hightower:1997:PDD, author = "Lauren Hightower", title = "Publishing Dynamic Data on the {Internet} --- {Allaire's Cold Fusion} is a development tool that provides access (via the {Web}) to any database the {Web} server can access using {ODBC}. {Cold Fusion} runs as a multithreaded {Windows NT} system service and works with any {ODBC-compliant} database", journal = j-DDJ, volume = "22", number = "1", pages = "70--??", month = jan, year = "1997", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Fri Jan 3 06:17:24 MST 1997", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Dr. 
Dobb's Journal of Software Tools", } @Book{Hughes:1997:OOM, author = "Cameron Hughes and Tracey Hughes", title = "Object-oriented multithreading using {C++}", publisher = pub-WILEY, address = pub-WILEY:adr, pages = "xvi + 495", year = "1997", ISBN = "0-471-18012-2 (paperback)", ISBN-13 = "978-0-471-18012-8 (paperback)", LCCN = "QA76.73.C153H84 1997", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, annote = "System requirements: Windows 95, or OS/2 2.0 and above, or UNIX, or system with POSIX pthreads; ANSI/ISO compliant C++ compiler.", keywords = "C++ (Computer program language); POSIX (Computer software standard); Threads (Computer programs)", } @Article{Kacsuk:1997:MIC, author = "P. Kacsuk and M. Amamiya", title = "A Multithreaded Implementation Concept of {Prolog} on {Datarol-II} Machine", journal = j-LECT-NOTES-COMP-SCI, volume = "1336", pages = "91--??", year = "1997", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Tue Apr 28 08:51:33 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Kasperink:1997:CDC, author = "Harold R. Kasperink and John C. Dekker", title = "Concurrent Database Commands and {C++}", journal = j-DDJ, volume = "22", number = "8", pages = "84, 86, 88, 89, 98", month = aug, year = "1997", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Sat Aug 23 07:57:02 1997", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Mapping design problems to programming problems leads to software solutions that are easy to extend and reuse. Our authors explain how they resolved multithreaded porting problems using design patterns. 
The database they use is Oracle and the database transactions are implemented using Oracle ProC as an embedded database command language.", acknowledgement = ack-nhfb, fjournal = "Dr. Dobb's Journal of Software Tools", } @MastersThesis{Khosla:1997:MAT, author = "Samir Khosla", title = "Multithreading the asynchronous trigger processor", type = "Thesis ({M.S.})", school = "University of Florida", address = "Gainesville, FL, USA", pages = "ix + 57", year = "1997", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Kougiouris:1997:PMF, author = "Panos Kougiouris and Marco Framba", title = "A Portable Multithreading Framework", journal = j-CCCUJ, volume = "15", number = "8", pages = "??--??", month = aug, year = "1997", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Wed Aug 20 10:44:42 1997", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{Krieger:1997:HPO, author = "Orran Krieger and Michael Stumm", title = "{HFS}: a Performance-Oriented Flexible File System Based on Building-Block Compositions", journal = j-TOCS, volume = "15", number = "3", pages = "286--321", month = aug, year = "1997", CODEN = "ACSYEC", ISSN = "0734-2071 (print), 1557-7333 (electronic)", ISSN-L = "0734-2071", bibdate = "Wed Jan 13 18:36:53 MST 1999", bibsource = "http://www.acm.org/pubs/contents/journals/tocs/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/journals/tocs/1997-15-3/p286-krieger/", abstract = "The Hurricane File System (HFS) is designed for (potentially large-scale) shared-memory multiprocessors. Its architecture is based on the principle that, in order to maximize performance for applications with diverse requirements, a file system must support a wide variety of file structures, file system policies, and I/O interfaces. 
Files in HFS are implemented using simple building blocks composed in potentially complex ways. This approach yields great flexibility, allowing an application to customize the structure and policies of a file to exactly meet its requirements. As an extreme example, HFS allows a file's structure to be optimized for concurrent random-access write-only operations by 10 threads, something no other file system can do. Similarly, the prefetching, locking, and file cache management policies can all be chosen to match an application's access pattern. In contrast, most parallel file systems support a single file structure and a small set of policies. We have implemented HFS as part of the Hurricane operating system running on the Hector shared-memory multiprocessor. We demonstrate that the flexibility of HFS comes with little processing or I/O overhead. We also show that for a number of file access patterns, HFS is able to deliver to the applications the full I/O bandwidth of the disks on our system.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Computer Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J774", keywords = "design; performance", subject = "{\bf D.4.3} Software, OPERATING SYSTEMS, File Systems Management, File organization. {\bf D.4.3} Software, OPERATING SYSTEMS, File Systems Management, Access methods. {\bf D.4.8} Software, OPERATING SYSTEMS, Performance, Measurements. {\bf E.5} Data, FILES, Optimization**. {\bf E.5} Data, FILES, Organization/structure.", } @Article{Kwak:1997:VMN, author = "H. Kwak and B. Lee and A. R. 
Hurson", title = "Viability of Multithreading on Networks of Workstations", journal = j-LECT-NOTES-COMP-SCI, volume = "1277", pages = "216--??", year = "1997", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Tue Apr 28 08:51:33 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @PhdThesis{Lang:1997:MTE, author = "Duncan Walter Temple Lang", title = "A multi-threaded extension to a high level interactive statistical computing environment", type = "Thesis ({Ph.D. in Statistics})", school = "University of California, Berkeley", address = "Berkeley, CA, USA", pages = "vii + 161", month = dec, year = "1997", LCCN = "308t 1997 951", bibdate = "Fri Aug 7 08:29:38 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Dissertations -- Academic -- UCB -- statistics -- 1991--2000; University of California, Berkeley. Dept. of Statistics -- Dissertations", } @Article{Larbi:1997:BRM, author = "Michael Larbi", title = "Book Review: {Multithreading Applications in Win32}", journal = j-CCCUJ, volume = "15", number = "7", pages = "65--??", month = jul, year = "1997", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Thu Jun 26 14:12:46 1997", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{Leiserson:1997:AAM, author = "C. E. 
Leiserson", title = "Algorithmic analysis of multithreaded algorithms", journal = j-LECT-NOTES-COMP-SCI, volume = "1350", pages = "132--??", year = "1997", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Tue Apr 28 08:51:33 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @MastersThesis{Leven:1997:MIR, author = "Peter J. Leven", title = "A multithreaded implementation of a {Robot Control C Library}", type = "Thesis ({M.S.})", school = "University of Illinois at Urbana-Champaign", address = "Urbana-Champaign, IL, USA", pages = "x + 72", year = "1997", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Lo:1997:CTL, author = "Jack L. Lo and Joel S. Emer and Henry M. Levy and Rebecca L. Stamm and Dean M. Tullsen", title = "Converting Thread-Level Parallelism to Instruction-Level Parallelism via Simultaneous Multithreading", journal = j-TOCS, volume = "15", number = "3", pages = "322--354", month = aug, year = "1997", CODEN = "ACSYEC", ISSN = "0734-2071 (print), 1557-7333 (electronic)", ISSN-L = "0734-2071", bibdate = "Wed Jan 13 18:36:53 MST 1999", bibsource = "http://www.acm.org/pubs/contents/journals/tocs/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/journals/tocs/1997-15-3/p322-lo/", abstract = "To achieve high performance, contemporary computer systems rely on two forms of parallelism: instruction-level parallelism (ILP) and thread-level parallelism (TLP). Wide-issue super-scalar processors exploit ILP by executing multiple instructions from a single program in a single cycle. Multiprocessors (MP) exploit TLP by executing different threads in parallel on different processors. 
Unfortunately, both parallel processing styles statically partition processor resources, thus preventing them from adapting to dynamically changing levels of ILP and TLP in a program. With insufficient TLP, processors in an MP will be idle; with insufficient ILP, multiple-issue hardware on a superscalar is wasted. This article explores parallel processing on an alternative architecture, simultaneous multithreading (SMT), which allows multiple threads to compete for and share all of the processor's resources every cycle. The most compelling reason for running parallel applications on an SMT processor is its ability to use thread-level parallelism and instruction-level parallelism interchangeably. By permitting multiple threads to share the processor's functional units simultaneously, the processor can use both ILP and TLP to accommodate variations in parallelism. When a program has only a single thread, all of the SMT processor's resources can be dedicated to that thread; when more TLP exists, this parallelism can compensate for a lack of per-thread ILP. We examine two alternative on-chip parallel architectures for the next generation of processors. We compare SMT and small-scale, on-chip multiprocessors in their ability to exploit both ILP and TLP. First, we identify the hardware bottlenecks that prevent multiprocessors from effectively exploiting ILP. Then, we show that because of its dynamic resource sharing, SMT avoids these inefficiencies and benefits from being able to run more threads on a single processor. The use of TLP is especially advantageous when per-thread ILP is limited. The ease of adding additional thread contexts on an SMT (relative to adding additional processors on an MP) allows simultaneous multithreading to expose more parallelism, further increasing functional unit utilization and attaining a 52\% average speedup (versus a four-processor, single-chip multiprocessor with comparable execution resources). 
This study also addresses an often-cited concern regarding the use of thread-level parallelism or multithreading: interference in the memory system and branch prediction hardware. We find the multiple threads cause interthread interference in the caches and place greater demands on the memory system, thus increasing average memory latencies. By exploiting thread-level parallelism, however, SMT hides these additional latencies, so that they only have a small impact on total program performance. We also find that for parallel applications, the additional threads have minimal effects on branch prediction.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Computer Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J774", keywords = "measurement; performance", subject = "{\bf C.1.2} Computer Systems Organization, PROCESSOR ARCHITECTURES, Multiple Data Stream Architectures (Multiprocessors), Parallel processors**. {\bf C.0} Computer Systems Organization, GENERAL, Instruction set design. {\bf D.4.1} Software, OPERATING SYSTEMS, Process Management.", } @Article{Lo:1997:CTP, author = "Jack L. Lo and Joel S. Emer and Henry M. Levy and Rebecca L. Stamm and Dean M. Tullsen", title = "Converting Thread-Level Parallelism to Instruction-Level Parallelism via Simultaneous Multithreading", journal = j-TOCS, volume = "15", number = "3", pages = "322--354", month = aug, year = "1997", CODEN = "ACSYEC", ISSN = "0734-2071 (print), 1557-7333 (electronic)", ISSN-L = "0734-2071", bibdate = "Wed Jan 13 18:36:53 MST 1999", bibsource = "http://www.acm.org/pubs/contents/journals/tocs/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/tocs.bib", URL = "http://www.acm.org:80/pubs/citations/journals/tocs/1997-15-3/p322-lo/", abstract = "To achieve high performance, contemporary computer systems rely on two forms of parallelism: instruction-level parallelism (ILP) and thread-level parallelism (TLP). 
Wide-issue super-scalar processors exploit ILP by executing multiple instructions from a single program in a single cycle. Multiprocessors (MP) exploit TLP by executing different threads in parallel on different processors. Unfortunately, both parallel processing styles statically partition processor resources, thus preventing them from adapting to dynamically changing levels of ILP and TLP in a program. With insufficient TLP, processors in an MP will be idle; with insufficient ILP, multiple-issue hardware on a superscalar is wasted. This article explores parallel processing on an alternative architecture, simultaneous multithreading (SMT), which allows multiple threads to compete for and share all of the processor's resources every cycle. The most compelling reason for running parallel applications on an SMT processor is its ability to use thread-level parallelism and instruction-level parallelism interchangeably. By permitting multiple threads to share the processor's functional units simultaneously, the processor can use both ILP and TLP to accommodate variations in parallelism. When a program has only a single thread, all of the SMT processor's resources can be dedicated to that thread; when more TLP exists, this parallelism can compensate for a lack of per-thread ILP. We examine two alternative on-chip parallel architectures for the next generation of processors. We compare SMT and small-scale, on-chip multiprocessors in their ability to exploit both ILP and TLP. First, we identify the hardware bottlenecks that prevent multiprocessors from effectively exploiting ILP. Then, we show that because of its dynamic resource sharing, SMT avoids these inefficiencies and benefits from being able to run more threads on a single processor. The use of TLP is especially advantageous when per-thread ILP is limited. 
The ease of adding additional thread contexts on an SMT (relative to adding additional processors on an MP) allows simultaneous multithreading to expose more parallelism, further increasing functional unit utilization and attaining a 52\% average speedup (versus a four-processor, single-chip multiprocessor with comparable execution resources). This study also addresses an often-cited concern regarding the use of thread-level parallelism or multithreading: interference in the memory system and branch prediction hardware. We find the multiple threads cause interthread interference in the caches and place greater demands on the memory system, thus increasing average memory latencies. By exploiting thread-level parallelism, however, SMT hides these additional latencies, so that they only have a small impact on total program performance. We also find that for parallel applications, the additional threads have minimal effects on branch prediction.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Computer Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J774", keywords = "measurement; performance", subject = "{\bf C.1.2} Computer Systems Organization, PROCESSOR ARCHITECTURES, Multiple Data Stream Architectures (Multiprocessors), Parallel processors**. {\bf C.0} Computer Systems Organization, GENERAL, Instruction set design. {\bf D.4.1} Software, OPERATING SYSTEMS, Process Management.", } @TechReport{LoCocero:1997:MML, author = "Joseph LoCocero and D. E. (Donald E.) 
Thomas", title = "A multithreaded, multiple language hardware\slash software cosimulator", type = "Research report", number = "CMUCAD-97-13", institution = "Center for Electronic Design Automation, Carnegie Mellon University", address = "Pittsburgh, PA, USA", pages = "7", month = apr, year = "1997", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Functional verification of mixed hardware/software systems is vital to guaranteeing a correct, operational system. This paper discusses a new multithreaded, multiple-language cosimulator that directly combines Verilog and C/C++, the native languages most often used by hardware and software designers. The interface between the two languages is specified in detail, as are some illustrative examples. The performance is shown to be clearly better than UNIX socket-based cosimulation approaches. Further, it naturally fits a cosimulation environment where arbitrary C++ programs and Verilog descriptions are developed concurrently.", acknowledgement = ack-nhfb, annote = "Supported in part by Semiconductor Research Corporation.", keywords = "C (Computer program language); Embedded computer systems -- Simulation methods; Verilog (Computer hardware description language)", } @Article{Loeffler:1997:MJF, author = "G. Loeffler", title = "A Multithreaded {Java} Framework for Solving Linear Elliptic Partial Differential Equations in {3D}", journal = j-LECT-NOTES-COMP-SCI, volume = "1343", pages = "121--??", year = "1997", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Tue Apr 28 08:51:33 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/java.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Lundberg:1997:BMC, author = "L. 
Lundberg", title = "Bounding the Minimal Completion Time of Static Mappings of Multithreaded {Solaris} Programs", journal = j-LECT-NOTES-COMP-SCI, volume = "1300", pages = "1034--??", year = "1997", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Tue Apr 28 08:51:33 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Mateosian:1997:MNT, author = "R. M. Mateosian", title = "Micro News: {DARPA} aids {Tera MTA}", journal = j-IEEE-MICRO, volume = "17", number = "5", pages = "5--6", month = sep # "\slash " # oct, year = "1997", CODEN = "IEMIDZ", DOI = "https://doi.org/10.1109/MM.1997.621216", ISSN = "0272-1732 (print), 1937-4143 (electronic)", ISSN-L = "0272-1732", bibdate = "Thu Dec 14 06:08:58 MST 2000", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeemicro.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; Science Citation Index database (1980--2000)", URL = "http://dlib.computer.org/mi/books/mi1997/pdf/m5005.pdf", acknowledgement = ack-nhfb, fjournal = "IEEE Micro", journal-URL = "http://www.computer.org/csdl/mags/mi/index.html", } @Article{McCarthy:1997:MTI, author = "Martin McCarthy", title = "Multi-Threading: Intermediate Concepts", journal = j-LINUX-J, volume = "36", pages = "??--??", month = apr, year = "1997", CODEN = "LIJOFX", ISSN = "1075-3583 (print), 1938-3827 (electronic)", ISSN-L = "1075-3583", bibdate = "Fri Oct 9 08:35:26 MDT 1998", bibsource = "http://noframes.linuxjournal.com/lj-issues/issue36/index.html; https://www.math.utah.edu/pub/tex/bib/linux-journal.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "ftp://ftp.ssc.com/pub/lj/listings/issue36/2121.tgz", abstract = "This second part of a series on Multi-threading deals with how to use C programs with one of the POSIX packages available for Linux to handle signals and concurrent threads in 
global data.", acknowledgement = ack-nhfb, fjournal = "Linux Journal", journal-URL = "http://portal.acm.org/citation.cfm?id=J508", } @Article{McCarthy:1997:WMT, author = "Martin McCarthy", title = "What is Multi-Threading?", journal = j-LINUX-J, volume = "34", pages = "??--??", month = feb, year = "1997", CODEN = "LIJOFX", ISSN = "1075-3583 (print), 1938-3827 (electronic)", ISSN-L = "1075-3583", bibdate = "Fri Oct 9 08:35:26 MDT 1998", bibsource = "http://noframes.linuxjournal.com/lj-issues/issue34/index.html; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "A primer on multi-threading: the process whereby Linux manages several tasks simultaneously.", acknowledgement = ack-nhfb, fjournal = "Linux Journal", journal-URL = "http://portal.acm.org/citation.cfm?id=J508", } @Article{McMillan:1997:NSB, author = "Robert McMillan", title = "News: {Sun} boosts {Java} performance, adding {JIT} compiler and {JVM} with multithreading to {Solaris 2.6}", journal = j-JAVAWORLD, volume = "2", number = "7", pages = "??--??", month = jul, year = "1997", CODEN = "????", ISSN = "1091-8906", bibdate = "Thu Aug 13 14:52:27 1998", bibsource = "http://www.javaworld.com/javaworld/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.javaworld.com/javaworld/jw-07-1997/jw-07-speedway.htm", acknowledgement = ack-nhfb, } @Article{Moreno:1997:PMP, author = "E. D. Moreno and S. T. Kofuji and M. H. Cintra", title = "Prefetching and Multithreading Performance in Bus-Based Multiprocessors with {Petri} Nets", journal = j-LECT-NOTES-COMP-SCI, volume = "1300", pages = "1017--??", year = "1997", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Tue Apr 28 08:51:33 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Neves:1997:TRS, author = "Richard Neves and Robert B. 
Schnabel", title = "Threaded Runtime Support for Execution of Fine Grain Parallel Code on Coarse Grain Multiprocessors", journal = j-J-PAR-DIST-COMP, volume = "42", number = "2", pages = "128--142", day = "1", month = may, year = "1997", CODEN = "JPDCER", DOI = "https://doi.org/10.1006/jpdc.1997.1322", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Thu Mar 9 09:19:02 MST 2000", bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1997.1322/production; http://www.idealibrary.com/links/doi/10.1006/jpdc.1997.1322/production/pdf; http://www.idealibrary.com/links/doi/10.1006/jpdc.1997.1322/production/ref", acknowledgement = ack-nhfb, fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", } @Book{Oaks:1997:JT, author = "Scott Oaks and Henry Wong", title = "{Java} threads", publisher = pub-ORA, address = pub-ORA:adr, pages = "xiii + 252", year = "1997", ISBN = "1-56592-216-6", ISBN-13 = "978-1-56592-216-7", LCCN = "QA76.73.J38 O25 1997", bibdate = "Fri May 10 12:18:17 MDT 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", series = "Java series", acknowledgement = ack-nhfb, keywords = "java (computer program language); threads (computer programs)", } @MastersThesis{Ongwattanakul:1997:RDM, author = "Songpol Ongwattanakul", title = "A runtime distributed multithreading library for the {PARC} language", type = "Thesis ({M.E.E.})", school = "Department of Electrical Engineering, University of Alabama", address = "Tuscaloosa, AL, USA", pages = "viii + 71", year = "1997", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Multiprocessors; Parallel processing (Electronic computers)", } 
@Article{Onion:1997:MM, author = "F. Onion", title = "Multithreading in {MFC}", journal = j-C-PLUS-PLUS-REPORT, volume = "9", number = "3", pages = "50--53, 56", month = mar, year = "1997", CODEN = "CRPTE7", ISSN = "1040-6042", bibdate = "Thu Apr 24 09:46:14 MDT 1997", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classification = "C6110J (Object-oriented programming); C6115 (Programming support); C6150J (Operating systems)", fjournal = "C++ Report", keywords = "API calls; application program interfaces; Internet queries; MFC; multiprogramming; multithreaded programming; object oriented programming; object-oriented programming; remote database hits; software libraries; software tools; threads; user interface; user interfaces; Windows", treatment = "P Practical", } @Article{Park:1997:HPM, author = "Sung-Yong Park and Salim Hariri", title = "A High Performance Message Passing System for {Network of Workstations}", journal = j-J-SUPERCOMPUTING, volume = "11", number = "2", pages = "159--180", month = oct, year = "1997", CODEN = "JOSUED", DOI = "https://doi.org/10.1023/A:1007912007767", ISSN = "0920-8542 (print), 1573-0484 (electronic)", ISSN-L = "0920-8542", bibdate = "Wed Jul 6 12:13:07 MDT 2005", bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=11&issue=2; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.wkap.nl/issuetoc.htm/0920-8542+11+2+1997", URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=11&issue=2&spage=159; http://www.wkap.nl/oasis.htm/149826", acknowledgement = ack-nhfb, classification = "C5620W (Other computer networks); C6150N (Distributed systems software)", corpsource = "Dept. of Electr. and Comput. 
Eng., Syracuse Univ., NY, USA", fjournal = "The Journal of Supercomputing", journal-URL = "http://link.springer.com/journal/11227", keywords = "application programming interface; asynchronous transfer mode; ATM; ATM network; device driver; distributed computing; high performance; message passing; message-passing system; multithreaded message-passing system; NCS; network of workstations; NOW environment; NYNET; wide area network; wide area networks", pubcountry = "Netherlands", treatment = "P Practical", } @Book{Prasad:1997:MPT, author = "Shashi Prasad", title = "Multithreading programming techniques", publisher = pub-MCGRAW-HILL, address = pub-MCGRAW-HILL:adr, pages = "xix + 410", year = "1997", ISBN = "0-07-912250-7, 0-07-050710-4 (Computer disk)", ISBN-13 = "978-0-07-912250-6, 978-0-07-050710-4 (Computer disk)", LCCN = "QA76.76.D47 P72 1997", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", series = "The J. Ranade workstation series", acknowledgement = ack-nhfb, annote = "System requirements: C programming language.", keywords = "Application software -- Development; C (Computer program language); Cross-platform software development", } @Article{Ravoor:1997:MTP, author = "Suresh B. Ravoor and Johnny S. K. 
Wong", title = "Multithreaded Transaction Processing in Distributed Systems", journal = j-J-SYST-SOFTW, volume = "38", number = "2", pages = "107--117", month = aug, year = "1997", CODEN = "JSSODM", ISSN = "0164-1212 (print), 1873-1228 (electronic)", ISSN-L = "0164-1212", bibdate = "Wed Dec 16 08:24:49 MST 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "The Journal of systems and software", journal-URL = "http://www.sciencedirect.com/science/journal/01641212", } @Article{Savage:1997:EDD, author = "Stefan Savage and Michael Burrows and Greg Nelson and Patrick Sobalvarro and Thomas Anderson", title = "{Eraser}: a Dynamic Data Race Detector for Multithreaded Programs", journal = j-TOCS, volume = "15", number = "4", pages = "391--411", month = nov, year = "1997", CODEN = "ACSYEC", ISSN = "0734-2071 (print), 1557-7333 (electronic)", ISSN-L = "0734-2071", bibdate = "Wed Jan 13 18:36:53 MST 1999", bibsource = "http://www.acm.org/pubs/contents/journals/tocs/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "Co-published in {\em Operating Systems Review}, {\bf 31}(5).", URL = "http://www.acm.org:80/pubs/citations/journals/tocs/1997-15-4/p391-savage/", abstract = "Multithreaded programming is difficult and error prone. It is easy to make a mistake in synchronization that produces a data race, yet it can be extremely hard to locate this mistake during debugging. This article describes a new tool, called Eraser, for dynamically detecting data races in lock-based multithreaded programs. Eraser uses binary rewriting techniques to monitor every shared-memory reference and verify that consistent locking behavior is observed. 
We present several case studies, including undergraduate coursework and a multithreaded Web search engine, that demonstrate the effectiveness of this approach.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Computer Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J774", keywords = "algorithms; experimentation; reliability", subject = "{\bf D.2.5} Software, SOFTWARE ENGINEERING, Testing and Debugging, Monitors. {\bf D.1.3} Software, PROGRAMMING TECHNIQUES, Concurrent Programming, Parallel programming. {\bf D.2.5} Software, SOFTWARE ENGINEERING, Testing and Debugging, Debugging aids. {\bf D.2.5} Software, SOFTWARE ENGINEERING, Testing and Debugging, Tracing. {\bf D.4.1} Software, OPERATING SYSTEMS, Process Management, Concurrency. {\bf D.4.1} Software, OPERATING SYSTEMS, Process Management, Deadlocks. {\bf D.4.1} Software, OPERATING SYSTEMS, Process Management, Multiprocessing/multiprogramming/multitasking. {\bf D.4.1} Software, OPERATING SYSTEMS, Process Management, Mutual exclusion.", } @Article{Shepherd:1997:UCA, author = "George Shepherd and Scot Wingo", title = "Undocumented Corner: {ATL} and the {IUnknown} Interface", journal = j-DDJ, volume = "22", number = "8", pages = "119--123", month = aug, year = "1997", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Mon Aug 11 11:38:10 MDT 1997", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "George and Scot continue their examination of Microsoft's Active Template Library, this month looking at the heart of ATL, including its support for multithreading and its various implementations of IUnknown.", acknowledgement = ack-nhfb, fjournal = "Dr. 
Dobb's Journal of Software Tools", } @Article{Shoffner:1997:JSSa, author = "Michael Shoffner", title = "{Java} Step by Step: Write your own threaded discussion forum", journal = j-JAVAWORLD, volume = "2", number = "2", pages = "??--??", month = feb, year = "1997", CODEN = "????", ISSN = "1091-8906", bibdate = "Thu Aug 13 14:52:24 1998", bibsource = "http://www.javaworld.com/javaworld/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.javaworld.com/javaworld/jw-02-1997/jw-02-step.htm", acknowledgement = ack-nhfb, } @Article{Shoffner:1997:JSSb, author = "Michael Shoffner", title = "{Java} Step By Step: Write your own threaded discussion forum: The communications and server components", journal = j-JAVAWORLD, volume = "2", number = "3", pages = "??--??", month = mar, year = "1997", CODEN = "????", ISSN = "1091-8906", bibdate = "Thu Aug 13 14:52:25 1998", bibsource = "http://www.javaworld.com/javaworld/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.javaworld.com/javaworld/jw-03-1997/jw-03-step.htm", acknowledgement = ack-nhfb, } @Article{Sime:1997:GPM, author = "J. 
Sime", title = "Guarded pointers: moving smart pointers into multithreaded systems", journal = j-C-PLUS-PLUS-REPORT, volume = "9", number = "4", pages = "32--41", month = apr, year = "1997", CODEN = "CRPTE7", ISSN = "1040-6042", bibdate = "Thu Apr 24 09:46:14 MDT 1997", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classification = "C6110J (Object-oriented programming); C6120 (File organisation); C6130 (Data handling techniques); C6150N (Distributed systems software)", fjournal = "C++ Report", keywords = "abstract data types; C listings; concurrency control; concurrency control pattern; data integrity; exception handling; guarded pointers; multiprogramming; multithreaded systems; object-oriented programming; protected data resource; protection proxy pattern; reference count lock; safety; smart pointers; thread safety mechanisms", treatment = "P Practical", } @Article{Sinharoy:1997:OTC, author = "Balaram Sinharoy", title = "Optimized Thread Creation for Processor Multithreading", journal = j-COMP-J, volume = "40", number = "6", pages = "388--??", month = "????", year = "1997", CODEN = "CMPJA6", ISSN = "0010-4620 (print), 1460-2067 (electronic)", ISSN-L = "0010-4620", bibdate = "Wed Jul 21 09:55:15 MDT 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/compj1990.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www3.oup.co.uk/computer_journal/Volume_40/Issue_06/Vol40_06.index.html", URL = "http://www.oup.co.uk/computer_journal/Volume_40/Issue_06/Vol40_06.body.html#AbstractSinharoy; http://www3.oup.co.uk/computer_journal/Volume_40/Issue_06/Vol40_06.body.html#AbstractSinharoy", acknowledgement = ack-nhfb, email-1 = "balaram@watson.ibm.com", fjournal = "The Computer Journal", journal-URL = "http://comjnl.oxfordjournals.org/", } @Article{Sodan:1997:ENN, author = "Angela Sodan and Guang R. 
Gao and Olivier Maquelin and Jens-Uwe Schultz and Xin-Min Tian", title = "Experiences with Non-numeric Applications on Multithreaded Architectures", journal = j-SIGPLAN, volume = "32", number = "7", pages = "124--135", month = jul, year = "1997", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:17:35 MST 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan1990.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Sohn:1997:DWD, author = "Andrew Sohn and Mitsuhisa Sato and Namhoon Yoo and Jean-Luc Gaudiot", title = "Data and Workload Distribution in a Multithreaded Architecture", journal = j-J-PAR-DIST-COMP, volume = "40", number = "2", pages = "256--264", day = "1", month = feb, year = "1997", CODEN = "JPDCER", DOI = "https://doi.org/10.1006/jpdc.1996.1262", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Thu Mar 9 09:19:02 MST 2000", bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1262/production; http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1262/production/pdf; http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1262/production/ref", acknowledgement = ack-nhfb, fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", } @Article{Stewart:1997:MDH, author = "David B. Stewart and Pradeep K. 
Khosla", title = "Mechanisms for Detecting and Handling Timing Errors", journal = j-CACM, volume = "40", number = "1", pages = "87--93", month = jan, year = "1997", CODEN = "CACMA2", ISSN = "0001-0782 (print), 1557-7317 (electronic)", ISSN-L = "0001-0782", bibdate = "Fri Oct 10 18:17:54 MDT 1997", bibsource = "http://www.acm.org/pubs/toc/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org/pubs/citations/journals/cacm/1997-40-1/p87-stewart/", acknowledgement = ack-nhfb, classification = "C6110B (Software engineering techniques); C6130 (Data handling techniques); C6150J (Operating systems)", corpsource = "Inst. for Adv. Comput. Studies, Maryland Univ., College Park, MD, USA", fjournal = "Communications of the ACM", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J79", keywords = "adaptive real-time scheduling; aperiodic servers; Chimera; design; error handling; imprecise computation; low-overhead policy-independent system; management; operating systems (computers); performance; periodic threads; real- time operating system; real-time systems; real-time systems analysis; real-time threads; reliability; scheduling; scheduling policies; software fault tolerance; specifications; system failure; theory; timing; timing error detection; worst-case execution times", subject = "{\bf K.6.3} Computing Milieux, MANAGEMENT OF COMPUTING AND INFORMATION SYSTEMS, Software Management, Software development. {\bf C.3} Computer Systems Organization, SPECIAL-PURPOSE AND APPLICATION-BASED SYSTEMS, Real-time systems. 
{\bf C.4} Computer Systems Organization, PERFORMANCE OF SYSTEMS.", treatment = "P Practical", } @Article{Taura:1997:FGM, author = "Kenjiro Taura and Akinori Yonezawa", title = "Fine-grain Multithreading with Minimal Compiler Support --- a Cost Effective Approach to Implementing Efficient Multithreading Languages", journal = j-SIGPLAN, volume = "32", number = "5", pages = "320--333", month = may, year = "1997", CODEN = "SINODQ", ISBN = "0-89791-907-6", ISBN-13 = "978-0-89791-907-4", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu May 13 12:37:28 MDT 1999", bibsource = "http://www.acm.org/pubs/contents/proceedings/pldi/258915/index.html; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan1990.bib", URL = "http://www.acm.org:80/pubs/citations/proceedings/pldi/258915/p320-taura/", acknowledgement = ack-nhfb, annote = "Published as part of the Proceedings of PLDI'97.", fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "design; languages; measurement; performance; standardization; theory", subject = "{\bf D.3.4} Software, PROGRAMMING LANGUAGES, Processors, Compilers. {\bf D.3.3} Software, PROGRAMMING LANGUAGES, Language Constructs and Features, Data types and structures. {\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language Classifications. {\bf D.3.4} Software, PROGRAMMING LANGUAGES, Processors, Code generation. {\bf C.2.2} Computer Systems Organization, COMPUTER-COMMUNICATION NETWORKS, Network Protocols.", } @PhdThesis{TempleLang:1997:MTE, author = "Duncan Walter {Temple Lang}", title = "A multi-threaded extension to a high level interactive statistical computing environment", type = "Thesis ({Ph.D. in Statistics})", school = "Dept. 
of Statistics, University of California, Berkeley", address = "Berkeley, CA, USA", pages = "vii + 161", month = dec, year = "1997", bibdate = "Sat Apr 20 11:15:46 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Thompson:1997:THP, author = "P. Thompson and G. Bumgardner", title = "{Threads.h++}: a portable {C++} library for multithreaded programming", journal = j-C-PLUS-PLUS-REPORT, volume = "9", number = "3", pages = "24--37", month = mar, year = "1997", CODEN = "CRPTE7", ISSN = "1040-6042", bibdate = "Thu Apr 24 09:46:14 MDT 1997", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classification = "C6110B (Software engineering techniques); C6110J (Object-oriented programming); C6115 (Programming support); C6150J (Operating systems)", fjournal = "C++ Report", keywords = "application development; application program interfaces; C language; low-level procedural API; multiprocessor machines; multiprogramming; multithreaded programming; object-oriented abstractions; object-oriented languages; object-oriented programming; operating systems; portable C++ library; responsive performance; software libraries; software portability; synchronisation; synchronization; thread control; thread creation; Threads.h++; Web browsers", treatment = "P Practical", } @Article{Thompson:1997:TPC, author = "P. Thompson and G. 
Bumgardner", title = "{Threads.h++}: a portable {C++} library for multithreaded programming", journal = j-C-PLUS-PLUS-REPORT, volume = "9", number = "3", pages = "24--37", month = mar, year = "1997", CODEN = "CRPTE7", ISSN = "1040-6042", bibdate = "Thu Apr 24 09:46:14 MDT 1997", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classification = "C6110B (Software engineering techniques); C6110J (Object-oriented programming); C6115 (Programming support); C6150J (Operating systems)", fjournal = "C++ Report", keywords = "application development; application program interfaces; C language; low-level procedural API; multiprocessor machines; multiprogramming; multithreaded programming; object-oriented abstractions; object-oriented languages; object-oriented programming; operating systems; portable C++ library; responsive performance; software libraries; software portability; synchronisation; synchronization; thread control; thread creation; Threads.h++; Web browsers", treatment = "P Practical", } @TechReport{Tsai:1997:PSC, author = "Jenn-Yuan Tsai", title = "Performance study of a concurrent multithreaded processor", type = "Technical report", number = "TR 97-034", institution = "University of Minnesota, Dept. of Computer Science and Engineering", address = "Minneapolis, MN, USA", pages = "24", year = "1997", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The performance of a concurrent multithreaded architectural model, called superthreading [15], is studied in this paper. It tries to integrate optimizing compilation techniques and run-time hardware support to exploit both thread-level and instruction-level parallelism, as opposed to exploit only instruction-level parallelism in existing superscalars. 
The superthreaded architecture uses a thread pipelining execution model to enhance the overlapping between threads, and to facilitate data dependence enforcement between threads through compiler-directed, hardware-supported, thread-level control speculation and run-time data dependence checking. We also evaluate the performance of the superthreaded processor through a detailed trace-driven simulator. Our results show that the superthreaded execution model can obtain good performance by exploiting both thread-level and instruction-level parallelism in programs. We also study the design parameters of its main system components, such as the size of the memory buffer, the bandwidth requirement of the communication links between thread processing units, and the bandwidth requirement of the shared data cache.", acknowledgement = ack-nhfb, annote = "Supported in part by the National Science Foundation. Supported in part by the U.S. Army Intelligence Center and Fort Huachuca. Supported in part by a gift from Intel Corporation", keywords = "Compilers (Computer programs); Computer architecture; Parallel processing (Electronic computers); Threads (Computer programs)", } @TechReport{Tsai:1997:SIC, author = "Jenn-Yuan Tsai", title = "Superthreading: integrating compilation technology and processor architecture for cost-effective concurrent multithreading", type = "Technical report", number = "TR 97-033", institution = "University of Minnesota, Dept. of Computer Science and Engineering", address = "Minneapolis, MN, USA", pages = "16", day = "29", month = jan, year = "1997", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "As the number of transistors that can be integrated on a single chip continues to grow, it is important for computer architects to think beyond the traditional approaches of deeper pipelines and wider instruction issue units for improving performance. 
This single-threaded execution model limits these approaches to exploiting only the relatively small amount of instruction-level parallelism available in application programs. While integrating an entire multiprocessor onto a single chip is feasible, this architecture is limited to exploiting only relatively coarse-grained heavy-weight parallelism. We propose the superthreaded architecture as an excellent alternative for utilizing the large number of transistors that will become available on a single high-density chip. As a hybrid of a wide-issue superscalar processor and a multiprocessor-on-a-chip, this new concurrent multithreading architecture can leverage the best of existing and future parallel hardware and software technologies. By incorporating speculation for control dependences and run-time checking of data dependences, the superthreaded architecture can exploit the multiple granularities of parallelism available in general-purpose application programs to reduce the execution time of a single program.", acknowledgement = ack-nhfb, annote = "Supported in part by the U.S. Army Intelligence Center and Fort Huachuca. Supported in part by the National Science Foundation. 
Supported in part by a gift from the Intel Corporation", keywords = "Compilers (Computer programs); Computer architecture; Parallel processing (Electronic computers); Threads (Computer programs)", } @Article{Vanhelsuwe:1997:BRJ, author = "Laurence Vanhelsuw{\'e}", title = "Book Review: The {Java} {Threads} {API} makes it to print media", journal = j-JAVAWORLD, volume = "2", number = "7", pages = "??--??", month = jul, year = "1997", CODEN = "????", ISSN = "1091-8906", bibdate = "Thu Aug 13 14:52:27 1998", bibsource = "http://www.javaworld.com/javaworld/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.javaworld.com/javaworld/jw-07-1997/jw-07-threads.htm", acknowledgement = ack-nhfb, } @Article{Vanhelsuwe:1997:JPE, author = "Laurence Vanhelsuw{\'e}", title = "{JavaBeans}: properties, events, and thread safety", journal = j-JAVAWORLD, volume = "2", number = "9", pages = "??--??", month = sep, year = "1997", CODEN = "????", ISSN = "1091-8906", bibdate = "Thu Aug 13 14:52:28 1998", bibsource = "http://www.javaworld.com/javaworld/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.javaworld.com/javaworld/jw-09-1997/jw-09-raceconditions.htm", acknowledgement = ack-nhfb, } @Article{Venners:1997:UHH, author = "Bill Venners", title = "Under the Hood: How the {Java} virtual machine performs thread synchronization", journal = j-JAVAWORLD, volume = "2", number = "7", pages = "??--??", month = jul, year = "1997", CODEN = "????", ISSN = "1091-8906", bibdate = "Thu Aug 13 14:52:27 1998", bibsource = "http://www.javaworld.com/javaworld/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.javaworld.com/javaworld/jw-07-1997/jw-07-hood.htm", acknowledgement = ack-nhfb, } @Article{Vermeulen:1997:JDW, author = "Alain Vermeulen", title = "{Java} Deadlock: The woes of multithreaded design", journal = j-DDJ, volume = "22", number = "9", pages = "52, 54--56, 88, 89", month = sep, year = "1997", CODEN = 
"DDJOEB", ISSN = "1044-789X", bibdate = "Mon Aug 11 12:53:44 1997", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Dr. Dobb's Journal of Software Tools", } @Article{Weisz:1997:MFA, author = "Russell Weisz", title = "More First Aid for the Thread Impaired: Cool Ways to Take Advantage of Multithreading", journal = j-MICROSOFT-SYS-J, volume = "12", number = "7", pages = "33--??", month = jul, year = "1997", CODEN = "MSJOED", ISSN = "0889-9932", bibdate = "Sat Nov 7 10:33:30 MST 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Microsoft Systems Journal", } @Article{Whittaker:1997:TML, author = "Steve Whittaker and Jerry Swanson and Jakov Kucan and Candy Sidner", title = "{TeleNotes}: managing lightweight interactions in the desktop", journal = j-TOCHI, volume = "4", number = "2", pages = "137--168", month = jun, year = "1997", CODEN = "ATCIF4", ISSN = "1073-0516 (print), 1557-7325 (electronic)", ISSN-L = "1073-0516", bibdate = "Tue Jan 19 05:49:17 MST 1999", bibsource = "http://www.acm.org/pubs/contents/journals/tochi/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/journals/tochi/1997-4-2/p137-whittaker/", abstract = "Communication theories and technology have tended to focus on extended, formal meetings and have neglected a prevalent and vital form of workplace communication --- namely, lightweight communication. Unlike formal, extended meetings, lightweight interaction is brief, informal, unplanned, and intermittent. We analyze naturalistic data from a study of work-place communication and derive five design criteria for lightweight interaction systems. These criteria require that systems for lightweight interaction support {\em conversational tracking, rapid connection}, the ability to {\em leave a message}, {\em context management}, and {\em shared real-time objects}. 
Using these criteria, we evaluate existing interpersonal communications technologies. We then describe an implementation of a system (TeleNotes) that is designed to support lightweight interaction by meeting these criteria. The interface metaphor allows communications to be based around desktop objects, resembling ``sticky notes.'' These objects are also organized into ``desktop piles'' to support conversational threads and provide mechanisms for initiating real-time audio, video, and application sharing. We conducted informal user testing of several system prototypes. Based on our findings, outstanding issues concerning theory and systems design for communication systems are outlined --- in particular, with regard to the issue of managing conversations over time.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Computer-Human Interaction", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J756", keywords = "human factors", subject = "{\bf H.5.3} Information Systems, INFORMATION INTERFACES AND PRESENTATION, Group and Organization Interfaces, Evaluation/methodology. {\bf H.1.2} Information Systems, MODELS AND PRINCIPLES, User/Machine Systems, Human factors. {\bf H.5.3} Information Systems, INFORMATION INTERFACES AND PRESENTATION, Group and Organization Interfaces, Asynchronous interaction. {\bf I.3.6} Computing Methodologies, COMPUTER GRAPHICS, Methodology and Techniques, Interaction techniques. {\bf H.5.3} Information Systems, INFORMATION INTERFACES AND PRESENTATION, Group and Organization Interfaces, Synchronous interaction. 
{\bf H.5.1} Information Systems, INFORMATION INTERFACES AND PRESENTATION, Multimedia Information Systems, Evaluation/methodology.", } @Article{Wilson:1997:BTP, author = "Greg Wilson", title = "Bookshelf: Threads Primer: a Guide To Multithreaded Programming", journal = j-IEEE-SOFTWARE, volume = "14", number = "5", pages = "116--116", month = sep # "\slash " # oct, year = "1997", CODEN = "IESOEG", ISSN = "0740-7459 (print), 1937-4194 (electronic)", ISSN-L = "0740-7459", bibdate = "Mon Sep 15 22:35:10 1997", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeesoft.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://dlib.computer.org/so/books/so1997/pdf/s5115.pdf", acknowledgement = ack-nhfb, fjournal = "IEEE Software", journal-URL = "http://www.computer.org/portal/web/csdl/magazines/software", } @MastersThesis{Yang:1997:MUA, author = "Chia Wei Yang", title = "A multi-context uniprocessor: another multithreaded architecture", type = "Thesis ({M.S.})", school = "California Polytechnic State University", address = "San Luis Obispo, CA, USA", pages = "viii + 129", year = "1997", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, annote = "Proposes a computer architecture model that adapts all advantages from multithreaded models to a uniprocessor environment.", keywords = "Computer architecture; Multiprocessors; Parallel processing (Electronic Computers)", } @Book{Adamo:1998:MTO, author = "Jean-Marc Adamo", title = "Multi-threaded object-oriented {MPI}-based message passing interface: the {ARCH} library", volume = "SECS 446", publisher = pub-KLUWER, address = pub-KLUWER:adr, pages = "xiv + 185", year = "1998", ISBN = "0-7923-8165-3", ISBN-13 = "978-0-7923-8165-5", LCCN = "TK5102.5.A293 1998", bibdate = "Fri Aug 7 08:29:38 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", price = "US\$120.00", series = "The Kluwer
international series in engineering and computer science", acknowledgement = ack-nhfb, keywords = "data transmission systems; object-oriented programming (computer science); threads (computer programs)", libnote = "Not yet in my library.", } @Article{Aiex:1998:CMT, author = "R. M. Aiex and S. L. Martins and C. C. Ribeiro and N. D. L. R. Rodriguez", title = "Cooperative Multi-thread Parallel Tabu Search with an Application to Circuit Partitioning", journal = j-LECT-NOTES-COMP-SCI, volume = "1457", pages = "310--??", year = "1998", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Sat Oct 10 14:40:24 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @InProceedings{Amaranth:1998:TBM, author = "Paul Amaranth", title = "A {Tcl}-based Multithreaded Test Harness", crossref = "USENIX:1998:PSA", pages = "??--??", year = "1998", bibdate = "Fri Oct 18 07:49:55 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://db.usenix.org/publications/library/proceedings/tcl98/amaranth.html", acknowledgement = ack-nhfb, } @Article{Anonymous:1998:MS, author = "Anonymous", title = "Multithreaded System", journal = j-IEEE-MICRO, volume = "18", number = "3", pages = "76--76", month = may # "\slash " # jun, year = "1998", CODEN = "IEMIDZ", ISSN = "0272-1732 (print), 1937-4143 (electronic)", ISSN-L = "0272-1732", bibdate = "Thu Dec 14 06:08:58 MST 2000", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeemicro.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; Science Citation Index database (1980--2000)", acknowledgement = ack-nhfb, fjournal = "IEEE Micro", journal-URL = "http://www.computer.org/csdl/mags/mi/index.html", } @Article{Anonymous:1998:NTS, author = "Anonymous", title = "New Tools: Software Development: {Uniscape}'s Internationalization Library; {Global 
Technologies}' {Unix-to-NT} Solution; {KAI}'s Multithreaded {Java} Debugging Tool; {Price Systems}' Parametric Forecasting Tool", journal = j-COMPUTER, volume = "31", number = "6", pages = "98, 102", month = jun, year = "1998", CODEN = "CPTRB4", ISSN = "0018-9162 (print), 1558-0814 (electronic)", ISSN-L = "0018-9162", bibdate = "Thu Jun 4 08:22:02 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/computer1990.bib; https://www.math.utah.edu/pub/tex/bib/java.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://dlib.computer.org/co/books/co1998/pdf/r6098.pdf", acknowledgement = ack-nhfb, fjournal = "Computer", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2", } @Article{Ball:1998:MTA, author = "Steve Ball and John Miller Crawford", title = "Multi-Threaded Assignment Surprises", journal = j-JAVA-REPORT, volume = "3", number = "??", pages = "??--??", month = sep, year = "1998", CODEN = "JREPFI", ISSN = "1086-4660", bibdate = "Sat Dec 26 13:52:53 1998", bibsource = "http://archive.javareport.com/9809/html/from_pages/index.shtml; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://archive.javareport.com/9809/html/from_pages/ftp_col1.shtml", abstract = "A volatile brew is formed by mixing assignment and threads. Perils and surprises lurk within the most innocent-looking statement. We expose those perils and surprises and point out where you need to proceed with due caution to ensure the effective use of locked objects.", acknowledgement = ack-nhfb, } @Article{Banga:1998:BOS, author = "Gaurav Banga and Peter Druschel and Jeffrey C.
Mogul", title = "Better operating system features for faster network servers", journal = j-SIGMETRICS, volume = "26", number = "3", pages = "23--30", month = dec, year = "1998", CODEN = "????", DOI = "https://doi.org/10.1145/306225.306234", ISSN = "0163-5999 (print), 1557-9484 (electronic)", ISSN-L = "0163-5999", bibdate = "Thu Jun 26 11:27:29 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Widely-used operating systems provide inadequate support for large-scale Internet server applications. Their algorithms and interfaces fail to efficiently support either event-driven or multi-threaded servers. They provide poor control over the scheduling and management of machine resources, making it difficult to provide robust and controlled service. We propose new UNIX interfaces to improve scalability, and to provide fine-grained scheduling and resource management.", acknowledgement = ack-nhfb, fjournal = "ACM SIGMETRICS Performance Evaluation Review", journal-URL = "http://portal.acm.org/toc.cfm?id=J618", } @TechReport{Beebe:1998:BPA, author = "Nelson H. F. 
Beebe", title = "A Bibliography of Publications about Multithreading", institution = inst-CSC, address = inst-CSC:adr, pages = "15", day = "7", month = aug, year = "1998", bibdate = "Sat Apr 11 10:26:14 1998", bibsource = "https://www.math.utah.edu/pub/bibnet/authors/b/beebe-nelson-h-f.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "This report is updated frequently.", URL = "https://www.math.utah.edu/pub/tex/bib/index-table-m.html#multithreading", } @Article{Biagioni:1998:SST, author = "Edoardo Biagioni and Ken Cline and Peter Lee and Chris Okasaki and Chris Stone", title = "Safe-for-Space Threads in {Standard ML}", journal = j-HIGHER-ORDER-SYMB-COMPUT, volume = "11", number = "2", pages = "209--225", month = dec, year = "1998", CODEN = "LSCOEX", DOI = "https://doi.org/10.1023/A:1010016600604", ISSN = "1388-3690 (print), 2212-0793 (electronic)", ISSN-L = "1388-3690", bibdate = "Wed Jul 6 15:50:28 MDT 2005", bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=1388-3690&volume=11&issue=2; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.wkap.nl/issuetoc.htm/1388-3690+11+2+1998; OCLC Contents1st database", URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=1388-3690&volume=11&issue=2&spage=209; http://www.wkap.nl/oasis.htm/187569", acknowledgement = ack-nhfb, fjournal = "Higher-Order and Symbolic Computation", } @TechReport{Bic:1998:MAD, author = "Lubomir Bic and Michael B. 
Dillencourt and Munehiro Fukuda", title = "Mobile agents, {DSM}, coordination, and self-migrating threads: a common framework", type = "UCI-ICS technical report", number = "98-33", institution = "Information and Computer Science, University of California, Irvine", address = "Irvine, CA", pages = "11", day = "8", month = oct, year = "1998", LCCN = "Z699 .C3 no.98-33", bibdate = "Fri May 10 12:18:17 MDT 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "distributed shared memory; intelligent agents (computer software)", } @Article{Blumofe:1998:SES, author = "Robert D. Blumofe and Charles E. Leiserson", title = "Space-Efficient Scheduling of Multithreaded Computations", journal = j-SIAM-J-COMPUT, volume = "27", number = "1", pages = "202--229", month = feb, year = "1998", CODEN = "SMJCAT", ISSN = "0097-5397 (print), 1095-7111 (electronic)", ISSN-L = "0097-5397", bibdate = "Sat Dec 5 17:26:53 MST 1998", bibsource = "http://epubs.siam.org/sam-bin/dbq/toclist/SICOMP/27/1; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://epubs.siam.org/sam-bin/dbq/article/25947", acknowledgement = ack-nhfb, fjournal = "SIAM Journal on Computing", journal-URL = "http://epubs.siam.org/sicomp", } @InProceedings{Brunett:1998:IET, author = "Sharon M. 
Brunett and John Thornley and Marrq Ellenbecker", title = "An Initial Evaluation of the {Tera} Multithreaded Architecture and Programming System Using the {C3I} Parallel Benchmark Suite", crossref = "ACM:1998:SHP", pages = "??--??", year = "1998", bibdate = "Wed Mar 06 06:27:47 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.supercomp.org/sc98/papers/", URL = "http://www.supercomp.org/sc98/TechPapers/sc98_FullAbstracts/Brunett1063/Index.htm", acknowledgement = ack-nhfb, } @InProceedings{Caromel:1998:JFS, author = "Denis Caromel and Julien Vayssiere", title = "A {Java} Framework for Seamless Sequential, Multi-threaded, and Distributed Programming", crossref = "ACM:1998:AWJ", pages = "??--??", year = "1998", bibdate = "Thu Apr 27 10:43:08 2000", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.cs.ucsb.edu/conferences/java98/papers/javapp.pdf; http://www.cs.ucsb.edu/conferences/java98/papers/javapp.ps", acknowledgement = ack-nhfb, } @Article{Chapman:1998:OHI, author = "B. Chapman and P. 
Mehrotra", title = "{OpenMP} and {HPF}: Integrating Two Paradigms", journal = j-LECT-NOTES-COMP-SCI, volume = "1470", pages = "650--??", year = "1998", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Sat Oct 10 14:40:24 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Chen:1998:MTO, author = "Jiajun Chen and Xiaodong Yuan and Guoliang Zheng", title = "A multi-threaded object-oriented programming model", journal = j-SIGSOFT, volume = "23", number = "3", pages = "83--86", month = may, year = "1998", CODEN = "SFENDP", DOI = "https://doi.org/10.1145/279437.279477", ISSN = "0163-5948 (print), 1943-5843 (electronic)", ISSN-L = "0163-5948", bibdate = "Wed Aug 1 17:13:36 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigsoft1990.bib", abstract = "This paper presents a concurrent object-oriented programming (COOP) model established around concurrent objects which may have a body. Once an object with a body is created, its body begins to run as a separate execution thread of the object. Distinguished from some active-object-based concurrent object-oriented models, the object body in our model is not used for the concurrency control of objects, but only as a mechanism to introduce concurrent executions into OO model. Concurrency control is specified by the attributes of objects and the control codes are generated by a compiling system based on these attributes. In addition, objects should be designed in such a way that they can be used in both sequential and concurrent environments, no matter whether they have a body or not. In our model, several execution threads may coexist in an object and some synchronization mechanisms are provided to control the concurrent executions of these threads.
The paper presents two examples of concurrent programming with our model.", acknowledgement = ack-nhfb, fjournal = "ACM SIGSOFT Software Engineering Notes", journal-URL = "https://dl.acm.org/citation.cfm?id=J728", } @Book{Cohen:1998:WMP, author = "Aaron Cohen and Mike Woodring", title = "{Win32} Multithreaded Programming", publisher = pub-ORA, address = pub-ORA:adr, pages = "xv + 705", year = "1998", ISBN = "1-56592-296-4", ISBN-13 = "978-1-56592-296-9", LCCN = "QA76.76.O63 C633 1998", bibdate = "Fri Aug 7 08:29:38 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", price = "US\$39.95", URL = "http://www.ora.com/catalog/multithread/; http://www.oreilly.com/catalog/multithread", acknowledgement = ack-nhfb, keywords = "Microsoft Win32; Microsoft Windows (Computer file); Operating systems (Computers)", } @Article{Criscolo:1998:JQ, author = "Mike Criscolo", title = "{Java Q\&A}: How Do {I} Queue {Java} Threads?", journal = j-DDJ, volume = "23", number = "10", pages = "127--129", month = oct, year = "1998", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Fri Sep 11 09:12:05 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.ddj.com/ftp/1998/1998_10/jqa108.txt; http://www.ddj.com/ftp/1998/1998_10/jqa108.zip", abstract = "In examining queuing techniques in Java, Mike presents one approach to multithreading he has implemented, and examines the differences between centralized- and distributed-queuing models. Additional resources include jqa108.txt (listings) and jqa108.zip (source code).", acknowledgement = ack-nhfb, fjournal = "Dr. 
Dobb's Journal of Software Tools", } @Article{Criscolo:1998:JQH, author = "Mike Criscolo", title = "{Java Q and A}: How Do {I} Queue {Java} Threads?", journal = j-DDJ, volume = "23", number = "10", pages = "127--129", month = oct, year = "1998", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Fri Sep 11 09:12:05 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.ddj.com/ftp/1998/1998_10/jqa108.txt; http://www.ddj.com/ftp/1998/1998_10/jqa108.zip", abstract = "In examining queuing techniques in Java, Mike presents one approach to multithreading he has implemented, and examines the differences between centralized- and distributed-queuing models. Additional resources include jqa108.txt (listings) and jqa108.zip (source code).", acknowledgement = ack-nhfb, remark = "This entry appears to duplicate Criscolo:1998:JQ, differing only in the title punctuation.", fjournal = "Dr. Dobb's Journal of Software Tools", } @Article{Cromwell:1998:PBD, author = "Jeff Cromwell", title = "Programmer's Bookshelf: The Dawning of the Age of Multithreading", journal = j-DDJ, volume = "23", number = "9", pages = "127, 129", month = sep, year = "1998", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Wed Aug 05 10:12:23 1998", bibsource = "http://www.ddj.com/ddj/1998/1998_09/index.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Jeff's focus this month is multithreading, as he examines {\em Multithreading Programming Techniques in Win32}, by Jim Beveridge and R. Wiener, {\em Object-Oriented Multithreading Using C++}, by Cameron and Tracy Hughes, and {\em Multithreading Programming Techniques}, by Shashi Prasad.", acknowledgement = ack-nhfb, fjournal = "Dr.
Dobb's Journal of Software Tools", } @Article{Dagum:1998:OIS, author = "Leonardo Dagum and Ramesh Menon", title = "{OpenMP}: An Industry-Standard {API} for Shared-Memory Programming", journal = j-IEEE-COMPUT-SCI-ENG, volume = "5", number = "1", pages = "46--55", month = jan # "\slash " # mar, year = "1998", CODEN = "ISCEE4", DOI = "https://doi.org/10.1109/99.660313", ISSN = "1070-9924 (print), 1558-190X (electronic)", ISSN-L = "1070-9924", bibdate = "Sat Jan 9 08:57:23 MST 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://dlib.computer.org/cs/books/cs1998/pdf/c1046.pdf; http://www.computer.org/cse/cs1998/c1046abs.htm", acknowledgement = ack-nhfb, fjournal = "IEEE Computational Science \& Engineering", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=99", } @Article{DeRusso:1998:MEH, author = "Joe {DeRusso, III} and Peter Haggar", title = "Multithreaded Exception Handling in {Java}", journal = j-JAVA-REPORT, volume = "3", number = "??", pages = "??--??", month = aug, year = "1998", CODEN = "JREPFI", ISSN = "1086-4660", bibdate = "Sat Dec 26 13:52:53 1998", bibsource = "http://archive.javareport.com/9808/html/from_pages/index.shtml; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://archive.javareport.com/9808/html/from_pages/ftp_feature.shtml", abstract = "Introducing new classes and interfaces to be used when writing multithreaded Java programs. 
These classes are small, easy to use, and effectively enable you to handle exceptions occurring on secondary threads.", acknowledgement = ack-nhfb, } @Article{Dyer:1998:CAS, author = "Dave Dyer", title = "Can {Assure} save {Java} from the perils of multithreading?", journal = j-JAVAWORLD, volume = "3", number = "10", pages = "??--??", year = "1998", CODEN = "????", ISSN = "1091-8906", bibdate = "Mon Jan 4 06:11:43 MST 1999", bibsource = "http://www.javaworld.com/javaworld/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.javaworld.com/javaworld/jw-10-1998/jw-10-assure.htm", acknowledgement = ack-nhfb, } @Article{Eskilson:1998:SMM, author = "Jesper Eskilson and Mats Carlsson", title = "{SICStus MT} --- a Multithreaded Execution Environment for {SICStus Prolog}", journal = j-LECT-NOTES-COMP-SCI, volume = "1490", pages = "36--53", year = "1998", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Tue Feb 5 11:53:01 MST 2002", bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t1490.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer-ny.com/link/service/series/0558/bibs/1490/14900036.htm; http://link.springer-ny.com/link/service/series/0558/papers/1490/14900036.pdf", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Frigo:1998:ICM, author = "Matteo Frigo and Charles E. Leiserson and Keith H. 
Randall", title = "The Implementation of the {Cilk-5} Multithreaded Language", journal = j-SIGPLAN, volume = "33", number = "5", pages = "212--223", month = may, year = "1998", CODEN = "SINODQ", ISBN = "0-89791-987-4", ISBN-13 = "978-0-89791-987-6", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:17:47 MST 2003", bibsource = "http://www.acm.org/pubs/contents/proceedings/pldi/277650/index.html; http://www.cs.virginia.edu/pldi98/program.html; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/proceedings/pldi/277650/p212-frigo/", acknowledgement = ack-nhfb, annote = "Published as part of the Proceedings of PLDI'98.", fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "algorithms; languages; performance", subject = "{\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language Classifications, Concurrent, distributed, and parallel languages. {\bf D.1.3} Software, PROGRAMMING TECHNIQUES, Concurrent Programming, Parallel programming. {\bf D.3.3} Software, PROGRAMMING LANGUAGES, Language Constructs and Features, Control structures. {\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language Classifications, C.", } @Article{Geary:1998:SM, author = "David Geary", title = "{Swing} and multithreading", journal = j-JAVA-REPORT, volume = "3", number = "??", pages = "??--??", month = nov, year = "1998", CODEN = "JREPFI", ISSN = "1086-4660", bibdate = "Sat Dec 26 13:52:53 1998", bibsource = "http://archive.javareport.com/9811/html/from_pages/index.shtml; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://archive.javareport.com/9811/html/from_pages/ftp_col1.shtml", abstract = "Read about why Swing is not thread-safe and the ramifications of a single-threaded design for developers using Swing.", acknowledgement = ack-nhfb, } @Article{Girkar:1998:IIM, author = "Milind Girkar and Mohammad R. 
Haghighat and Paul Grey and Hideki Saito and Nicholas Stavrakos and Constantine D. Polychronopoulos", title = "{Illinois-Intel} Multithreading Library: Multithreading Support for {Intel} Architecture Based Multiprocessor Systems", journal = j-INTEL-TECH-J, number = "Q1", pages = "15", year = "1998", ISSN = "1535-766X", bibdate = "Fri Jun 01 06:02:08 2001", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://developer.intel.com/technology/itj/q11998/articles/art_5.htm; http://developer.intel.com/technology/itj/q11998/pdf/iml.pdf", acknowledgement = ack-nhfb, } @Article{Golla:1998:CEB, author = "Prasad N. Golla and Eric C. Lin", title = "A comparison of the effect of branch prediction on multithreaded and scalar architectures", journal = j-COMP-ARCH-NEWS, volume = "26", number = "4", pages = "3--11", month = sep, year = "1998", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1216475.1216476", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 12:06:40 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Speculative instructions execution requires dynamic branch predictors to increase the performance of a processor by executing from predicted branch target routines. Conventional Scalar architectures such as the Superscalar or Multiscalar architecture executes from a single stream, while a Multithreaded architecture executes from multiple streams at a time. Several aggressive branch predictors have been proposed with high prediction accuracies. Unfortunately, none of the branch predictors can provide 100\% accuracy. Therefore, there is an inherent limitation on speculative execution in real implementation. In this paper, we show that Multithreaded architecture is a better candidate for utilizing speculative execution than Scalar architectures. 
Generally the branch prediction performance degradation is compounded for larger window sizes on Scalar architectures, while for a Multithreaded architecture, by increasing the number of executing threads, we could sustain a higher performance for a large aggregated speculative window size. Hence, heavier workloads may increase performance and utilization for Multithreaded architectures. We present analytical and simulation results to support our argument.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @TechReport{Golla:1998:CMR, author = "Prasad N. Golla and Eric C. Lin", title = "Cache memory requirements for multithreaded uniprocessor architecture", type = "Technical paper", number = "98-CSE-03", institution = "Dept. of Computer Science and Engineering, Southern Methodist University", address = "Dallas, TX, USA", pages = "32", year = "1998", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Gomez:1998:CAM, author = "J. C. Gomez and E. Mascarenhas and V. Rego", title = "The {CLAM} Approach to Multithreaded Communication on Shared Memory Multiprocessors: Design and Experiments", journal = j-IEEE-TRANS-PAR-DIST-SYS, volume = "9", number = "1", pages = "36--49", month = jan, year = "1998", CODEN = "ITDSEO", DOI = "https://doi.org/10.1109/71.655241", ISSN = "1045-9219 (print), 1558-2183 (electronic)", ISSN-L = "1045-9219", bibdate = "Fri Nov 6 12:31:15 MST 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://dlib.computer.org/td/books/td1998/pdf/l0036.pdf; http://www.computer.org/tpds/td1998/l0036abs.htm", acknowledgement = ack-nhfb, classification = "B6150M (Protocols); B6210L (Computer communications); C5440 (Multiprocessing systems); C5640 (Protocols); C5670 (Network performance)", corpsource = "Dept. of Comput. 
Sci., Purdue Univ., West Lafayette, IN, USA", fjournal = "IEEE Transactions on Parallel and Distributed Systems", journal-URL = "http://www.computer.org/tpds/archives.htm", keywords = "CLAM approach; communications environment; message passing; multithreaded communication; OS-level process; performance evaluation; protocols; scalable multiprotocol support; scheduling algorithms; shared memory systems; shared-memory multiprocessors; user-space protocols", treatment = "A Application; P Practical", } @Article{Gruen:1998:NIS, author = "T. Gruen and M. A. Hillebrand", title = "{NAS} Integer Sort on Multi-threaded Shared Memory Machines", journal = j-LECT-NOTES-COMP-SCI, volume = "1470", pages = "999--??", year = "1998", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Sat Oct 10 14:40:24 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Heber:1998:UMA, author = "G. Heber and R. Biswas and P. Thulasiraman and G. R. 
Gao", title = "Using Multithreading for the Automatic Load Balancing of Adaptive Finite Element Meshes", journal = j-LECT-NOTES-COMP-SCI, volume = "1457", pages = "132--??", year = "1998", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Sat Oct 10 14:40:24 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Holub:1998:PJTa, author = "Allen Holub", title = "Programming {Java} threads in the real world: Threading Architectures", journal = j-JAVAWORLD, volume = "3", number = "9", pages = "??--??", month = sep, year = "1998", CODEN = "????", ISSN = "1091-8906", bibdate = "Thu Sep 10 14:37:36 MDT 1998", bibsource = "http://www.javaworld.com/javaworld/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.holub.com/goodies/javaworld/jw_index.html; http://www.javaworld.com/javaworld/jw-09-1998/jw-09-threads.htm", acknowledgement = ack-nhfb, } @Article{Holub:1998:PJTb, author = "Allen Holub", title = "Programming {Java} threads in the real world, {Part} 2: Common multithreading Pitfalls (Deadlock, etc.)", journal = j-JAVAWORLD, volume = "3", number = "10", pages = "??--??", year = "1998", CODEN = "????", ISSN = "1091-8906", bibdate = "Mon Jan 4 06:11:43 MST 1999", bibsource = "http://www.javaworld.com/javaworld/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.holub.com/goodies/javaworld/jw_index.html; http://www.javaworld.com/javaworld/jw-10-1998/jw-10-toolbox.htm", acknowledgement = ack-nhfb, } @Article{Holub:1998:PJTc, author = "Allen Holub", title = "Programming {Java} threads in the real world, {Part} 3: Semaphore, Lock\_manager, and Mutex", journal = j-JAVAWORLD, volume = "3", number = "11", pages = "??--??", year = "1998", CODEN = "????", ISSN = "1091-8906", bibdate = "Mon Jan 4 06:11:43 MST 1999", bibsource = 
"http://www.javaworld.com/javaworld/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.holub.com/goodies/javaworld/jw_index.html; http://www.javaworld.com/javaworld/jw-11-1998/jw-11-toolbox.htm", acknowledgement = ack-nhfb, } @Article{Holub:1998:PJTd, author = "Allen Holub", title = "Programming {Java} threads in the real world, {Part} 4: Condition Variables and Counting Semaphores", journal = j-JAVAWORLD, volume = "3", number = "12", pages = "??--??", year = "1998", CODEN = "????", ISSN = "1091-8906", bibdate = "Mon Jan 4 06:22:03 MST 1999", bibsource = "http://www.javaworld.com/javaworld/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.holub.com/goodies/javaworld/jw_index.html; http://www.javaworld.com/javaworld/jw-12-1998/jw-12-toolbox.htm", acknowledgement = ack-nhfb, } @PhdThesis{Hopper:1998:CFM, author = "Michael A. Hopper", title = "A compiler framework for multithreaded parallel systems", type = "Thesis ({Ph.D.})", school = "School of Electrical and Computer Engineering, Georgia Institute of Technology", address = "Atlanta, GA, USA", pages = "xii + 110", year = "1998", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, annote = "Directed by William Appelbe.", keywords = "Compilers (Computer programs); Parallel processing (Electronic computers)", } @Article{Howes:1998:TPC, author = "Brad Howes", title = "Template processing classes for {Python}", journal = j-DDJ, volume = "23", number = "2", pages = "38, 40, 42, 44--46, 48, 100", month = feb, year = "1998", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Thu May 21 19:02:04 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/dr-dobbs.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Brad shows how you can embed Python objects in HTML pages using boilerplate template processing classes. 
Then Python creator Guido van Rossum adds a note on what's new in the just-released Python 1.5.", acknowledgement = ack-nhfb, classification = "C6130D (Document processing techniques); C6130M (Multimedia); C6160J (Object- oriented databases)", fjournal = "Dr. Dobb's Journal of Software Tools", keywords = "application program interfaces; BoilerPlate; CGI infrastructure; conditional control; Emacs; embedded HTML text; errors; HTML document template; HTML editing; hypermedia; iterative control; multithreaded CGI service; object database; object paradigm; object-oriented databases; page description languages; persistent objects; placeholders; print statements; Python; run- time values; run-time HTML generation; syntax coloring; tagged locations; template HTML constructs; template processing classes; text regions", treatment = "P Practical", } @Article{Itzkovitz:1998:TMA, author = "Ayal Itzkovitz and Assaf Schuster and Lea Shalev", title = "Thread migration and its applications in distributed shared memory systems", journal = j-J-SYST-SOFTW, volume = "42", number = "1", pages = "71--87", month = jul, year = "1998", CODEN = "JSSODM", ISSN = "0164-1212 (print), 1873-1228 (electronic)", ISSN-L = "0164-1212", bibdate = "Thu Dec 17 14:07:21 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "The Journal of systems and software", journal-URL = "http://www.sciencedirect.com/science/journal/01641212", } @Article{Ji:1998:PMM, author = "Minwen Ji and Edward W. 
Felten and Kai Li", title = "Performance measurements for multithreaded programs", journal = j-SIGMETRICS, volume = "26", number = "1", pages = "161--170", month = jun, year = "1998", CODEN = "????", DOI = "https://doi.org/10.1145/277858.277900", ISSN = "0163-5999 (print), 1557-9484 (electronic)", ISSN-L = "0163-5999", bibdate = "Thu Jun 26 11:25:18 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Multithreaded programming is an effective way to exploit concurrency, but it is difficult to debug and tune a highly threaded program. This paper describes a performance tool called Tmon for monitoring, analyzing and tuning the performance of multithreaded programs. The performance tool has two novel features: it uses `thread waiting time' as a measure and constructs thread waiting graphs to show thread dependencies and thus performance bottlenecks, and it identifies `semi-busy-waiting' points where CPU cycles are wasted in condition checking and context switching. We have implemented the Tmon tool and, as a case study, we have used it to measure and tune a heavily threaded file system. We used four workloads to tune different aspects of the file system. We were able to improve the file system bandwidth and throughput significantly. In one case, we were able to improve the bandwidth by two orders of magnitude.", acknowledgement = ack-nhfb, fjournal = "ACM SIGMETRICS Performance Evaluation Review", journal-URL = "http://portal.acm.org/toc.cfm?id=J618", } @InProceedings{Karamcheti:1998:HLB, author = "Vijay Karamcheti and Andrew A. 
Chien", title = "A Hierarchical Load-Balancing Framework for Dynamic Multithreaded Computations", crossref = "ACM:1998:SHP", pages = "??--??", year = "1998", bibdate = "Wed Mar 06 06:31:50 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.supercomp.org/sc98/papers/", URL = "http://www.supercomp.org/sc98/TechPapers/sc98_FullAbstracts/Karamcheti553/index.htm", acknowledgement = ack-nhfb, } @Article{Keckler:1998:EFG, author = "Stephen W. Keckler and William J. Dally and Daniel Maskit and Nicholas P. Carter and Andrew Chang and Whay S. Lee", title = "Exploiting fine-grain thread level parallelism on the {MIT} multi-{ALU} processor", journal = j-COMP-ARCH-NEWS, volume = "26", number = "3", pages = "306--317", month = jun, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:58 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Krinke:1998:SST, author = "Jens Krinke", title = "Static Slicing of Threaded Programs", journal = j-SIGPLAN, volume = "33", number = "7", pages = "35--42", month = jul, year = "1998", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:17:49 MST 2003", bibsource = "Compendex database; http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Static program slicing is an established method for analyzing sequential programs, especially for program understanding, debugging and testing. Until now, there was no slicing method for threaded programs which handles interference correctly. We present such a method which also calculates more precise static slices. 
This paper extends the well known structures of the control flow graph and the program dependence graph for threaded programs with interference. This new technique does not require serialization of threaded programs.", acknowledgement = ack-nhfb, affiliation = "Technische Universitaet Braunschweig", affiliationaddress = "Braunschweig, Ger", classification = "723; 723.1; 723.2; 723.5", conference = "Proceedings of the 1998 ACM SIGPLAN\slash SIGSOFT Workshop on Program Analysis for Software Tools and Engineering", fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", journalabr = "ACM SIGPLAN SIGSOFT Workshop Program Anal Software Tools Eng", keywords = "Computer aided software engineering; Computer software selection and evaluation; Control flow graphs; Data flow analysis; Data structures; Program debugging; Static program slicing; Threaded programs", meetingaddress = "Montreal, Can", meetingdate = "Jun 16 1998", meetingdate2 = "06/16/98", sponsor = "ACM", } @Article{Krone:1998:LBN, author = "O. Krone and M. Raab and B. Hirsbrunner", title = "Load Balancing for Network Based Multi-threaded Applications", journal = j-LECT-NOTES-COMP-SCI, volume = "1497", pages = "206--??", year = "1998", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Tue Jan 5 08:21:58 MST 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Book{Lewis:1998:MPP, author = "Bil Lewis and Daniel J. 
Berg", title = "Multithreaded programming with pthreads", publisher = pub-SUN, address = pub-SUN:adr, pages = "xxx + 382", year = "1998", ISBN = "0-13-680729-1 (paperback)", ISBN-13 = "978-0-13-680729-2 (paperback)", LCCN = "QA76.76.T55 L49 1998", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.amazon.com/exec/obidos/ASIN/0136807291/ref=sim_books/002-4892305-5599452; http://www.sun.com/books/catalog/lewis2/index.html", acknowledgement = ack-nhfb, alttitle = "Pthreads", keywords = "POSIX (Computer software standard); Threads (Computer programs); UNIX (Computer file)", } @Article{Lo:1998:ADW, author = "Jack L. Lo and Luiz Andr{\'e} Barroso and Susan J. Eggers and Kourosh Gharachorloo and Henry M. Levy and Sujay S. Parekh", title = "An analysis of database workload performance on simultaneous multithreaded processors", journal = j-COMP-ARCH-NEWS, volume = "26", number = "3", pages = "39--50", month = jun, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:58 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @InProceedings{Lu:1998:ONW, author = "Honghui Lu", title = "{OpenMP} on Networks of Workstations", crossref = "ACM:1998:SHP", pages = "??--??", year = "1998", bibdate = "Wed Oct 07 08:50:26 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.supercomp.org/sc98/papers/", acknowledgement = ack-nhfb, } @Article{Manley:1998:GPT, author = "Kevin T. 
Manley", title = "General-Purpose Threads with {I/O} Completion Ports", journal = j-CCCUJ, volume = "16", number = "4", pages = "??--??", month = apr, year = "1998", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:15 MDT 2002", bibsource = "http://www.cuj.com/articles/1998/9804/9804toc.htm?topic=articles; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Divide and conquer is a good strategy for partitioning a large job, provided you don't divide too much. Windows NT helps you guess right.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{Mascarenhas:1998:MTP, author = "Edward Mascarenhas and Vernon Rego", title = "Migrant threads on process farms: parallel programming with {Ariadne}", journal = j-CPE, volume = "10", number = "9", pages = "673--698", day = "10", month = aug, year = "1998", CODEN = "CPEXEI", ISSN = "1040-3108", ISSN-L = "1040-3108", bibdate = "Tue Sep 7 06:06:42 MDT 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www3.interscience.wiley.com/journalfinder.html", URL = "http://www3.interscience.wiley.com/cgi-bin/abstract?ID=10008703; http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=10008703&PLACEBO=IE.pdf", acknowledgement = ack-nhfb, fjournal = "Concurrency, practice and experience", } @Article{McManis:1998:DUT, author = "Chuck McManis", title = "In Depth: Using threads with collections, {Part 1}", journal = j-JAVAWORLD, volume = "3", number = "3", pages = "??--??", month = mar, year = "1998", CODEN = "????", ISSN = "1091-8906", bibdate = "Thu Aug 13 08:48:26 MDT 1998", bibsource = "http://www.javaworld.com/javaworld/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.javaworld.com/javaworld/jw-03-1998/jw-03-indepth.html", acknowledgement = ack-nhfb, } @Article{McManis:1998:JDU, author = "Chuck McManis", title = "{Java} In Depth: Using threads with collections, part 2", journal = j-JAVAWORLD, volume = "3", number = 
"6", pages = "??--??", month = jun, year = "1998", CODEN = "????", ISSN = "1091-8906", bibdate = "Thu Aug 13 08:48:26 MDT 1998", bibsource = "http://www.javaworld.com/javaworld/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.javaworld.com/javaworld/jw-06-1998/jw-06-indepth.html", acknowledgement = ack-nhfb, } @Article{Nebro:1998:EMR, author = "A. J. Nebro and E. Pimentel and J. M. Troya", title = "Evaluating a Multithreaded Runtime System for Concurrent Object-Oriented Languages", journal = j-LECT-NOTES-COMP-SCI, volume = "1505", pages = "167--??", year = "1998", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Tue Jan 5 08:21:58 MST 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Book{Nichols:1998:PP, author = "Bradford Nichols and Dick Buttlar and Jacqueline Proulx Farrell", title = "Pthreads programming", publisher = pub-ORA, address = pub-ORA:adr, pages = "xvi + 267", year = "1998", ISBN = "1-56592-115-1", ISBN-13 = "978-1-56592-115-3", LCCN = "QA76.642 .N53 1998", bibdate = "Fri May 10 12:18:17 MDT 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", series = "Nutshell handbook", acknowledgement = ack-nhfb, annote = "A POSIX standard for better multiprocessing.", keywords = "compilers (computer programs); parallel programming (computer science)", } @Article{Piumarta:1998:ODT, author = "Ian Piumarta and Fabio Riccardi", title = "Optimizing Direct-threaded Code by Selective Inlining", journal = j-SIGPLAN, volume = "33", number = "5", pages = "291--300", month = may, year = "1998", CODEN = "SINODQ", ISBN = "0-89791-987-4", ISBN-13 = "978-0-89791-987-6", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:17:47 MST 2003", bibsource = 
"http://www.acm.org/pubs/contents/proceedings/pldi/277650/index.html; http://www.cs.virginia.edu/pldi98/program.html; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/proceedings/pldi/277650/p291-piumarta/", acknowledgement = ack-nhfb, annote = "Published as part of the Proceedings of PLDI'98.", fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "algorithms; experimentation; languages; performance", subject = "{\bf D.3.4} Software, PROGRAMMING LANGUAGES, Processors, Optimization. {\bf D.3.4} Software, PROGRAMMING LANGUAGES, Processors, Interpreters. {\bf D.3.4} Software, PROGRAMMING LANGUAGES, Processors, Translator writing systems and compiler generators.", } @Article{Plauger:1998:SCCl, author = "P. J. Plauger", title = "{Standard C/C++}: Thread Safety", journal = j-CCCUJ, volume = "16", number = "12", pages = "??--??", month = dec, year = "1998", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:18 MDT 2002", bibsource = "http://www.cuj.com/articles/1998/9812/9812toc.htm?topic=articles; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The C++ Standard doesn't talk about thread safety, but everyone else does.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{Pomerantz:1998:CNS, author = "Dave Pomerantz", title = "{C++} Notifiers: Simplifying system development", journal = j-DDJ, volume = "23", number = "8", pages = "26, 28, 30--31, 89--90", month = aug, year = "1998", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Thu Jul 16 13:01:59 MDT 1998", bibsource = "http://www.ddj.com/ddj/1998/1998_08/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.ddj.com/ftp/1998/1998_08/notifier.txt; http://www.ddj.com/ftp/1998/1998_08/notifier.zip", abstract = "Notifiers, also called ``events'' or ``messages,'' are used to pass information anonymously between objects. 
Dave shows how notifiers can work in C++, using a multithreaded application as an example.", acknowledgement = ack-nhfb, fjournal = "Dr. Dobb's Journal of Software Tools", } @Article{Reck:1998:TSR, author = "Bill Reck", title = "Thread Synchronization with Reference-Counting Handles", journal = j-CCCUJ, volume = "16", number = "2", pages = "??--??", month = feb, year = "1998", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:14 MDT 2002", bibsource = "http://www.cuj.com/articles/1998/9802/9802toc.htm?topic=articles; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Often, the best time to protect access to a shared object is right when you reach for it.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{Reus:1998:VCO, author = "B. Reus and A. Knapp and P. Cenciarelli and M. Wirsing", title = "Verifying a compiler optimization for Multi-Threaded {Java}", journal = j-LECT-NOTES-COMP-SCI, volume = "1376", pages = "402--??", year = "1998", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Sat Oct 10 14:40:24 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/java.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Saghi:1998:MSH, author = "Gene Saghi and Kirk Reinholtz and Paul A. 
Savory", title = "A Multithreaded Scheduler for a High-speed Spacecraft Simulator", journal = j-SPE, volume = "28", number = "6", pages = "641--656", month = may, year = "1998", CODEN = "SPEXBL", ISSN = "0038-0644 (print), 1097-024X (electronic)", ISSN-L = "0038-0644", bibdate = "Thu Jul 29 15:11:48 MDT 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/spe.bib; http://www3.interscience.wiley.com/journalfinder.html", URL = "http://www3.interscience.wiley.com/cgi-bin/abstract?ID=1802; http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=1802&PLACEBO=IE.pdf", acknowledgement = ack-nhfb, fjournal = "Software --- Practice and Experience", journal-URL = "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1097-024X", } @Article{Schmidt:1998:EAM, author = "Douglas C. Schmidt", title = "Evaluating architectures for multithreaded object request brokers", journal = j-CACM, volume = "41", number = "10", pages = "54--60", month = oct, year = "1998", CODEN = "CACMA2", ISSN = "0001-0782 (print), 1557-7317 (electronic)", ISSN-L = "0001-0782", bibdate = "Tue Oct 6 21:15:42 MDT 1998", bibsource = "http://www.acm.org/pubs/toc/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/journals/cacm/1998-41-10/p54-schmidt/", acknowledgement = ack-nhfb, fjournal = "Communications of the ACM", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J79", } @Article{Seiden:1998:ROM, author = "S. S. 
Seiden", title = "Randomized Online Multi-threaded Paging", journal = j-LECT-NOTES-COMP-SCI, volume = "1432", pages = "264--??", year = "1998", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Sat Oct 10 14:40:24 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Shaw:1998:CIP, author = "Andrew Shaw and Arvind and Kyoo-Chan Cho and Christopher Hill and R. Paul Johnson and John Marshall", title = "A Comparison of Implicitly Parallel Multithreaded and Data-Parallel Implementations of an Ocean Model", journal = j-J-PAR-DIST-COMP, volume = "48", number = "1", pages = "1--51", day = "10", month = jan, year = "1998", CODEN = "JPDCER", DOI = "https://doi.org/10.1006/jpdc.1997.1390", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Thu Mar 9 09:19:04 MST 2000", bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1997.1390/production; http://www.idealibrary.com/links/doi/10.1006/jpdc.1997.1390/production/pdf; http://www.idealibrary.com/links/doi/10.1006/jpdc.1997.1390/production/ref", acknowledgement = ack-nhfb, fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", } @PhdThesis{Shaw:1998:CPM, author = "Andrew Shaw", title = "Compiling for parallel multithreaded computation on symmetric multiprocessors", type = "Thesis ({Ph.D.})", school = "Massachusetts Institute of Technology, Department of Electrical Engineering and Computer Science", address = "Cambridge, MA, USA", pages = "149", year = "1998", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } 
@Article{Shene:1998:MPI, author = "Chin-Kuang Shene", title = "Multithreaded programming in an introduction to operating systems course", journal = j-SIGCSE, volume = "30", number = "1", pages = "242--246", month = mar, year = "1998", CODEN = "SIGSD3", DOI = "https://doi.org/10.1145/274790.274305", ISSN = "0097-8418 (print), 2331-3927 (electronic)", ISSN-L = "0097-8418", bibdate = "Sat Nov 17 16:56:29 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigcse1990.bib", abstract = "This paper presents a way of teaching multithreaded programming as a component in an introduction to operating systems course. Topics include programming assignments, term projects, and experiences. This paper also suggests future work for overcoming a bottleneck that occurs in the current version of this course.", acknowledgement = ack-nhfb, fjournal = "SIGCSE Bulletin (ACM Special Interest Group on Computer Science Education)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J688", } @Article{Silc:1998:APC, author = "J. Silc and B. Robic and T. Ungerer", title = "Asynchrony in Parallel Computing: From Dataflow to Multithreading", journal = j-PARALLEL-DIST-COMP-PRACT, volume = "1", number = "1", pages = "??--??", month = "????", year = "1998", CODEN = "????", ISSN = "1097-2803", bibdate = "Fri Dec 19 08:14:11 MST 2003", bibsource = "http://www.cs.okstate.edu/~pdcp/vols/vol01/vol01no1.html; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.cs.okstate.edu/~pdcp/vols/vol01/vol01no1abs.html#silc", acknowledgement = ack-nhfb, fjournal = "PDCP: Parallel and Distributed Computing Practices", } @Article{Skillicorn:1998:MLP, author = "David B. 
Skillicorn and Domenico Talia", title = "Models and languages for parallel computation", journal = j-COMP-SURV, volume = "30", number = "2", pages = "123--169", month = jun, year = "1998", CODEN = "CMSVAN", ISSN = "0360-0300 (print), 1557-7341 (electronic)", ISSN-L = "0360-0300", bibdate = "Fri Sep 11 08:35:51 MDT 1998", bibsource = "http://www.acm.org/pubs/contents/journals/surveys/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/journals/surveys/1998-30-2/p123-skillicorn/", abstract = "We survey parallel programming models and languages using six criteria to assess their suitability for realistic portable parallel programming. We argue that an ideal model should be easy to program, should have a software development methodology, should be architecture-independent, should be easy to understand, should guarantee performance, and should provide accurate information about the cost of programs. These criteria reflect our belief that developments in parallelism must be driven by a parallel software industry based on portability and efficiency. We consider programming models in six categories, depending on the level of abstraction they provide. Those that are very abstract conceal even the presence of parallelism at the software level. Such models make software easy to build and port, but efficient and predictable performance is usually hard to achieve. At the other end of the spectrum, low-level models make all of the messy issues of parallel programming explicit (how many threads, how to place them, how to express communication, and how to schedule communication), so that software is hard to build and not very portable, but is usually efficient. Most recent models are near the center of this spectrum, exploring the best tradeoffs between expressiveness and performance. A few models have achieved both abstractness and efficiency. 
Both kinds of models raise the possibility of parallelism as part of the mainstream of computing.", acknowledgement = ack-nhfb, fjournal = "ACM Computing Surveys", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J204", keywords = "languages; performance; theory", subject = "{\bf C.4} Computer Systems Organization, PERFORMANCE OF SYSTEMS. {\bf D.1} Software, PROGRAMMING TECHNIQUES. {\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language Classifications.", } @InProceedings{Smith:1998:SIF, author = "Geoffrey Smith and Dennis Volpano", title = "Secure information flow in a multi-threaded imperative language", crossref = "ACM:1998:CRP", pages = "355--364", year = "1998", bibdate = "Mon May 3 12:57:52 MDT 1999", bibsource = "http://www.acm.org/pubs/toc/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/proceedings/plan/268946/p355-smith/", acknowledgement = ack-nhfb, keywords = "algorithms; languages; security; theory", subject = "{\bf F.3.3} Theory of Computation, LOGICS AND MEANINGS OF PROGRAMS, Studies of Program Constructs, Type structure. {\bf D.3.0} Software, PROGRAMMING LANGUAGES, General. {\bf D.2.0} Software, SOFTWARE ENGINEERING, General, Protection mechanisms. {\bf D.1.3} Software, PROGRAMMING TECHNIQUES, Concurrent Programming.", } @Article{Tennberg:1998:CAD, author = "Patrick Tennberg", title = "Creating Active Data Types via Multithreading", journal = j-CCCUJ, volume = "16", number = "1", pages = "??--??", month = jan, year = "1998", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:13 MDT 2002", bibsource = "http://www.cuj.com/articles/1998/9801/9801toc.htm?topic=articles; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "If you need multiple active agents in a program, you need multiple threads to synchronize them.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{Thitikamol:1998:PNM, author = "K. Thitikamol and P. 
Keleher", title = "Per-node multithreading and remote latency", journal = j-IEEE-TRANS-COMPUT, volume = "47", number = "4", pages = "414--426", month = apr, year = "1998", CODEN = "ITCOB4", DOI = "https://doi.org/10.1109/12.675711", ISSN = "0018-9340 (print), 1557-9956 (electronic)", ISSN-L = "0018-9340", bibdate = "Wed Jul 6 09:35:54 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput1990.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=675711", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Computers", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12", } @InProceedings{Thornley:1998:SSH, author = "John Thornley and K. Mani Chandy and Hiroshi Ishii", title = "A System for Structured High-Performance Multithreaded Programming in {Windows NT}", crossref = "USENIX:1998:PUWa", pages = "??--??", year = "1998", bibdate = "Fri Oct 18 07:49:55 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.usenix.org/publications/library/proceedings/usenix-nt98/thornley.html; http://www.usenix.org/publications/library/proceedings/usenix-nt98/thornley_slides", acknowledgement = ack-nhfb, } @Article{Tsai:1998:POC, author = "J.-Y. Tsai and Z. Jiang and P.-C. Yew", title = "Program Optimization for Concurrent Multithreaded Architectures", journal = j-LECT-NOTES-COMP-SCI, volume = "1366", pages = "146--??", year = "1998", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Sat Oct 10 14:40:24 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @InProceedings{Tullsen:1998:RSM, author = "Dean M. Tullsen and Susan J. Eggers and Henry M. 
Levy", title = "Retrospective: {Simultaneous} multithreading: maximizing on-chip parallelism", crossref = "ACM:1998:PAI", pages = "115--116", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Tullsen:1998:SMM, author = "Dean M. Tullsen and Susan J. Eggers and Henry M. Levy", title = "Simultaneous multithreading: maximizing on-chip parallelism", crossref = "ACM:1998:PAI", pages = "533--544", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @Article{Venners:1998:DTS, author = "Bill Venners", title = "Design for thread safety", journal = j-JAVAWORLD, volume = "3", number = "8", pages = "??--??", month = aug, year = "1998", CODEN = "????", ISSN = "1091-8906", bibdate = "Thu Sep 10 14:37:30 MDT 1998", bibsource = "http://www.javaworld.com/javaworld/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.javaworld.com/javaworld/jw-08-1998/jw-08-techniques.htm", acknowledgement = ack-nhfb, } @InProceedings{Vishkin:1998:EMT, author = "Uzi Vishkin and Shlomit Dascal and Efraim Berkovich and Joseph Nuzman", booktitle = "SPAA '98: 10th Annual ACM Symposium on Parallel Algorithms and Architectures, June 28--July 2, 1998, Puerto Vallarta, Mexico", title = "Explicit multi-threading ({XMT}) bridging models for instruction parallelism (extended abstract)", publisher = pub-ACM, address = pub-ACM:adr, year = "1998", DOI = "https://doi.org/10.1145/277651.277680", ISBN = "0-89791-989-0", ISBN-13 = "978-0-89791-989-0", LCCN = "QA76.58 .A26 1998", bibdate = "Fri Jul 27 05:37:45 2001", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", 
note = "ACM order number 417980.", URL = "http://delivery.acm.org/10.1145/280000/277680/p140-vishkin.pdf", acknowledgement = ack-nhfb, bookpages = "viii + 310", keywords = "IA-64", } @Article{Wallace:1998:TMP, author = "Steven Wallace and Brad Calder and Dean M. Tullsen", title = "Threaded multiple path execution", journal = j-COMP-ARCH-NEWS, volume = "26", number = "3", pages = "238--249", month = jun, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:58 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @PhdThesis{Weissman:1998:ATT, author = "Boris Weissman", title = "Active threads: towards efficient fine-grained parallelism in object-oriented systems", type = "Thesis ({Ph.D. in Computer Science})", school = "Department of Computer Science, University of California, Berkeley", address = "Berkeley, CA, USA", year = "1998", LCCN = "T7.6.1998 W457", bibdate = "Fri May 10 12:18:17 MDT 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "dissertations, academic -- UCB -- Computer Science -- 1991--2000; University of California, Berkeley, Dept. 
Of Computer Science -- dissertations", } @Article{Weissman:1998:PCS, author = "Boris Weissman", title = "Performance Counters and State Sharing Annotations: a Unified Approach to Thread Locality", journal = j-SIGPLAN, volume = "33", number = "11", pages = "127--138", month = nov, year = "1998", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:17:54 MST 2003", bibsource = "http://portal.acm.org/; http://www.acm.org/pubs/toc/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "Co-published in {\em Operating Systems Review}, {\bf 32}(5).", URL = "http://www.acm.org:80/pubs/citations/proceedings/asplos/291069/p127-weissman/", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "design; experimentation; measurement; performance; theory", subject = "{\bf D.4.1} Software, OPERATING SYSTEMS, Process Management, Scheduling. {\bf F.1.2} Theory of Computation, COMPUTATION BY ABSTRACT DEVICES, Modes of Computation, Parallelism and concurrency. {\bf D.4.8} Software, OPERATING SYSTEMS, Performance, Simulation. 
{\bf G.3} Mathematics of Computing, PROBABILITY AND STATISTICS, Markov processes.", } @Article{Wilde:1998:RES, author = "Norman Wilde and Christopher Casey and Joe Vandeville and Gary Trio and Dick Hotz", title = "Reverse engineering of software threads: a design recovery technique for large multi-process systems", journal = j-J-SYST-SOFTW, volume = "43", number = "1", pages = "11--17", month = oct, year = "1998", CODEN = "JSSODM", ISSN = "0164-1212 (print), 1873-1228 (electronic)", ISSN-L = "0164-1212", bibdate = "Wed Dec 16 08:24:49 MST 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "The Journal of systems and software", journal-URL = "http://www.sciencedirect.com/science/journal/01641212", } @Article{Wilmot:1998:DTM, author = "Dick Wilmot", title = "Data threaded microarchitecture", journal = j-COMP-ARCH-NEWS, volume = "26", number = "5", pages = "22--32", month = dec, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:21 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Zhou:1998:LST, author = "Honbo Zhou and Al Geist", title = "{LPVM}: a step towards multithread {PVM}", journal = j-CPE, volume = "10", number = "5", pages = "407--416", day = "25", month = apr, year = "1998", CODEN = "CPEXEI", ISSN = "1040-3108", ISSN-L = "1040-3108", bibdate = "Tue Sep 7 06:06:40 MDT 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www3.interscience.wiley.com/journalfinder.html", URL = "http://www3.interscience.wiley.com/cgi-bin/abstract?ID=5385; http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=5385&PLACEBO=IE.pdf", acknowledgement = ack-nhfb, fjournal = "Concurrency, practice and experience", } 
@Article{Anonymous:1999:BST, author = "Anonymous", title = "Bookshelf: Surviving the Top Ten Challenges of Software Development; The {Year 2000} Crisis; The Continuing Challenge; Software Project Survival Guide; Object-Oriented Multithreading Using {C++}", journal = j-IEEE-SOFTWARE, volume = "16", number = "1", pages = "114--??", month = jan # "\slash " # feb, year = "1999", CODEN = "IESOEG", ISSN = "0740-7459 (print), 0740-7459 (electronic)", ISSN-L = "0740-7459", bibdate = "Thu Apr 1 16:52:57 MST 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeesoft.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://dlib.computer.org/so/books/so1999/pdf/s1114.pdf", acknowledgement = ack-nhfb, fjournal = "IEEE Software", journal-URL = "http://www.computer.org/portal/web/csdl/magazines/software", } @Article{Antoniu:1999:ETT, author = "G. Antoniu and L. Bouge and R. Namyst", title = "An Efficient and Transparent Thread Migration Scheme in the {PM2} Runtime System", journal = j-LECT-NOTES-COMP-SCI, volume = "1586", pages = "496--??", year = "1999", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Mon Sep 13 16:57:02 MDT 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/lncs1999a.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Azagury:1999:NIR, author = "Alain Azagury and Elliot K. 
Kolodner and Erez Petrank", title = "A Note on the Implementation of Replication-Based Garbage Collection for Multithreaded Applications and Multiprocessor Environments", journal = j-PARALLEL-PROCESS-LETT, volume = "9", number = "3", pages = "391--??", month = sep, year = "1999", CODEN = "PPLTEE", ISSN = "0129-6264 (print), 1793-642X (electronic)", bibdate = "Thu Jan 6 12:02:35 MST 2005", bibsource = "http://ejournals.wspc.com.sg/ppl/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Parallel Processing Letters", journal-URL = "http://www.worldscientific.com/loi/ppl", } @Article{Blumofe:1999:SMC, author = "Robert D. Blumofe and Charles E. Leiserson", title = "Scheduling multithreaded computations by work stealing", journal = j-J-ACM, volume = "46", number = "5", pages = "720--748", month = sep, year = "1999", CODEN = "JACOAH", ISSN = "0004-5411 (print), 1557-735X (electronic)", ISSN-L = "0004-5411", bibdate = "Sun Jan 23 12:19:49 MST 2000", bibsource = "http://www.acm.org/pubs/toc/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org/pubs/citations/journals/jacm/1999-46-5/p720-blumofe/", acknowledgement = ack-nhfb, fjournal = "Journal of the ACM", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J401", } @Article{Bouge:1999:ECM, author = "L. Bouge and J.-F. Mehaut and R. 
Namyst", title = "Efficient Communications in Multithreaded Runtime Systems", journal = j-LECT-NOTES-COMP-SCI, volume = "1586", pages = "468--482", year = "1999", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Fri Mar 16 07:33:54 2001", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Broadman:1999:ECM, author = "Allen Broadman and Eric Shaw", title = "Executing a Class Member in Its Own Thread", journal = j-CCCUJ, volume = "17", number = "12", pages = "??--??", month = dec, year = "1999", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:24 MDT 2002", bibsource = "http://www.cuj.com/articles/1999/9912/9912toc.htm?topic=articles; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Creating a separate thread to execute a member function call is a messy business that's often necessary. It's a task well worth encapsulating.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{Cappello:1999:PNB, author = "F. Cappello and O. Richard and D. Etiemble", title = "Performance of the {NAS} Benchmarks on a Cluster of {SMP PCs} Using a Parallelization of the {MPI} Programs with {OpenMP}", journal = j-LECT-NOTES-COMP-SCI, volume = "1662", pages = "339--350", year = "1999", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Mon Sep 13 16:57:02 MDT 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/lncs1999b.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pvm.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Cenciarelli:1999:EBS, author = "P. Cenciarelli and A. Knapp and B. Reus and M. 
Wirsing", title = "An Event-Based Structural Operational Semantics of Multi-Threaded {Java}", journal = j-LECT-NOTES-COMP-SCI, volume = "1523", pages = "157--??", year = "1999", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Mon Sep 13 16:57:02 MDT 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/lncs1999a.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Chappell:1999:SSM, author = "Robert S. Chappell and Jared Stark and Sangwook P. Kim and Steven K. Reinhardt and Yale N. Patt", title = "Simultaneous subordinate microthreading {(SSMT)}", journal = j-COMP-ARCH-NEWS, volume = "27", number = "2", pages = "186--195", month = may, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Dascal:1999:ELR, author = "Shlomit Dascal and Uzi Vishkin", title = "Experiments with List Ranking for Explicit Multi-Threaded {(XMT)} Instruction Parallelism (Extended Abstract)", journal = j-LECT-NOTES-COMP-SCI, volume = "1668", pages = "43--??", year = "1999", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Mon Feb 4 12:03:08 MST 2002", bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t1668.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer-ny.com/link/service/series/0558/bibs/1668/16680043.htm; http://link.springer-ny.com/link/service/series/0558/papers/1668/16680043.pdf", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{delaPuente:1999:RTP, author = "Juan 
A. de la Puente and Jos{\'e} F. Ruiz and Jes{\'u}s M. Gonz{\'a}lez-Barahona", title = "Real-Time Programming with {GNAT}: Specialized Kernels versus {POSIX} Threads", journal = j-SIGADA-LETTERS, volume = "19", number = "2", pages = "73--77", month = jun, year = "1999", CODEN = "AALEE5", ISSN = "1094-3641 (print), 1557-9476 (electronic)", ISSN-L = "1094-3641", bibdate = "Tue Aug 31 07:04:20 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGADA Ada Letters", } @Article{DeWitt:1999:PTL, author = "Anthony DeWitt and Thomas Gross", title = "The potential of thread-level speculation based on value profiling", journal = j-COMP-ARCH-NEWS, volume = "27", number = "1", pages = "22--22", month = mar, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:35 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Duda:1999:BVT, author = "Kenneth J. Duda and David R. Cheriton", title = "Borrowed-virtual-time {(BVT)} scheduling: supporting latency-sensitive threads in a general-purpose scheduler", journal = j-OPER-SYS-REV, volume = "33", number = "5", pages = "261--276", month = dec, year = "1999", CODEN = "OSRED8", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:55 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @InProceedings{Garcia:1999:MMI, author = "F. Garcia and A. Calderon and J. 
Carretero", title = "{MiMPI}: a multithread-safe implementation of {MPI}", crossref = "Dongarra:1999:RAP", number = "1697", pages = "207--214", year = "1999", bibdate = "Thu Dec 9 06:08:35 MST 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Greiner:1999:PTE, author = "John Greiner and Guy E. Blelloch", title = "A provably time-efficient parallel implementation of full speculation", journal = j-TOPLAS, volume = "21", number = "2", pages = "240--285", month = mar, year = "1999", CODEN = "ATPSDT", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Tue Sep 26 10:12:58 MDT 2000", bibsource = "http://www.acm.org/pubs/contents/journals/toplas/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org/pubs/citations/journals/toplas/1999-21-2/p240-greiner/", abstract = "Speculative evaluation, including leniency and futures, is often used to produce high degrees of parallelism. Understanding the performance characteristics of such evaluation, however, requires having a detailed understanding of the implementation. For example, the particular implementation technique used to suspend and reactivate threads can have an asymptotic effect on performance. With the goal of giving the users some understanding of performance without requiring them to understand the implementation, we present a provable implementation bound for a language based on speculative evaluation. The idea is (1) to supply the users with a semantics for a language that defines abstract costs for measuring or analyzing the performance of computations, (2) to supply the users with a mapping of these costs onto runtimes on various machine models, and (3) to describe an implementation strategy of the language and prove that it meets these mappings. For this purpose we consider a simple language based on speculative evaluation. 
For every computation, the semantics of the language returns a directed acyclic graph (DAG) in which each node represents a unit of computation, and each edge represents a dependence. We then describe an implementation strategy of the language and show that any computation with $w$ work (the number of nodes in the DAG) and $d$ depth (the length of the longest path in the DAG) will run on a $p$-processor PRAM in $ O(w / p + d \log p) $ time. The bounds are work efficient (within a constant factor of linear speedup) when there is sufficient parallelism, $ w / d p \log p $. These are the first time bounds we know of for languages with speculative evaluation. The main challenge is in parallelizing the necessary queuing operations on suspended threads.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Programming Languages and Systems", generalterms = "Languages; Performance; Theory", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783", keywords = "abstract machines; parallel languages; profiling semantics; speculation; threads", subject = "Software --- Software Engineering --- Metrics (D.2.8); Software --- Programming Languages --- Language Classifications (D.3.2): {\bf Data-flow languages}; Software --- Programming Languages --- Language Classifications (D.3.2); Theory of Computation --- Computation by Abstract Devices --- Modes of Computation (F.1.2): {\bf Parallelism and concurrency}; Theory of Computation --- Computation by Abstract Devices --- Modes of Computation (F.1.2); Theory of Computation --- Logics and Meanings of Programs --- Specifying and Verifying and Reasoning about Programs (F.3.1)", } @Article{Gu:1999:EJT, author = "Yan Gu and B. S. 
Lee and Wentong Cai", title = "Evaluation of {Java} thread performance on two different multithreaded kernels", journal = j-OPER-SYS-REV, volume = "33", number = "1", pages = "34--46", month = jan, year = "1999", CODEN = "OSRED8", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:37 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @Article{Harrington:1999:WMM, author = "John Harrington", title = "{Win32} Multithreading Made Easy", journal = j-CCCUJ, volume = "17", number = "8", pages = "48, 50--52, 54--56", month = aug, year = "1999", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:22 MDT 2002", bibsource = "http://www.cuj.com/articles/1999/9908/9908toc.htm?topic=articles; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Multithreading logic is hard to write and hard to maintain. So keep it simple and separate.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{Holub:1999:PJTa, author = "Allen Holub", title = "Programming {Java} threads in the real world, {Part} 5: Timers", journal = j-JAVAWORLD, volume = "4", number = "2", pages = "??--??", month = feb, year = "1999", CODEN = "????", ISSN = "1091-8906", bibdate = "Thu Mar 04 12:56:16 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.holub.com/goodies/javaworld/jw_index.html", acknowledgement = ack-nhfb, } @Article{Holub:1999:PJTb, author = "Allen Holub", title = "Programming {Java} threads in the real world, {Part} 6: {Mach '99}: Observer and the Mysteries of the {AWTEventMulticaster}", journal = j-JAVAWORLD, volume = "4", number = "3", pages = "??--??", month = mar, year = "1999", CODEN = "????", ISSN = "1091-8906", bibdate = "Thu Mar 04 12:56:16 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = 
"http://www.holub.com/goodies/javaworld/jw_index.html", acknowledgement = ack-nhfb, } @Article{Jonsson:1999:NPS, author = "J. Jonsson and H. Loenn and K. G. Shin", title = "Non-preemptive Scheduling of Real-Time Threads on Multi-Level-Context Architectures", journal = j-LECT-NOTES-COMP-SCI, volume = "1586", pages = "363--??", year = "1999", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Mon Sep 13 16:57:02 MDT 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/lncs1999a.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Karamcheti:1999:ASM, author = "Vijay Karamcheti and Andrew A. Chien", title = "Architectural Support and Mechanisms for Object Caching in Dynamic Multithreaded Computations", journal = j-J-PAR-DIST-COMP, volume = "58", number = "2", pages = "260--300", month = aug, year = "1999", CODEN = "JPDCER", DOI = "https://doi.org/10.1006/jpdc.1999.1555", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Thu Mar 9 09:19:08 MST 2000", bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1999.1555/production; http://www.idealibrary.com/links/doi/10.1006/jpdc.1999.1555/production/pdf; http://www.idealibrary.com/links/doi/10.1006/jpdc.1999.1555/production/ref", acknowledgement = ack-nhfb, fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", } @Article{Kekckler:1999:CEH, author = "S. W. Keckler and A. Chang and W. S. Lee and S. Chatterjee and W. J. 
Dally", title = "Concurrent event handling through multithreading", journal = j-IEEE-TRANS-COMPUT, volume = "48", number = "9", pages = "903--916", month = sep, year = "1999", CODEN = "ITCOB4", DOI = "https://doi.org/10.1109/12.795220", ISSN = "0018-9340 (print), 1557-9956 (electronic)", ISSN-L = "0018-9340", bibdate = "Wed Jul 6 08:46:59 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput1990.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=795220", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Computers", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12", } @Article{Krishnan:1999:CMA, author = "V. Krishnan and J. Torrellas", title = "A chip-multiprocessor architecture with speculative multithreading", journal = j-IEEE-TRANS-COMPUT, volume = "48", number = "9", pages = "866--880", month = sep, year = "1999", CODEN = "ITCOB4", DOI = "https://doi.org/10.1109/12.795218", ISSN = "0018-9340 (print), 1557-9956 (electronic)", ISSN-L = "0018-9340", bibdate = "Wed Jul 6 08:46:59 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput1990.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=795218", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Computers", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12", } @Article{Kusakabe:1999:INS, author = "S. Kusakabe and K. Inenaga and M. Amamiya and X. 
Tang", title = "Implementing a Non-strict Functional Programming Language on a Threaded Architecture", journal = j-LECT-NOTES-COMP-SCI, volume = "1586", pages = "138--??", year = "1999", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Mon Sep 13 16:57:02 MDT 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/lncs1999a.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Kwak:1999:EMC, author = "H. Kwak and B. Lee and A. R. Hurson and Suk-Han Yoon and Woo-Jong Hahn", title = "Effects of multithreading on cache performance", journal = j-IEEE-TRANS-COMPUT, volume = "48", number = "2", pages = "176--184", month = feb, year = "1999", CODEN = "ITCOB4", DOI = "https://doi.org/10.1109/12.752659", ISSN = "0018-9340 (print), 1557-9956 (electronic)", ISSN-L = "0018-9340", bibdate = "Wed Jul 6 08:46:56 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput1990.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=752659", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Computers", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12", } @Article{Lo:1999:SDR, author = "J. L. Lo and S. S. Parekh and S. J. Eggers and H. M. Levy and D. M. 
Tullsen", title = "Software-Directed Register Deallocation for Simultaneous Multithreaded Processors", journal = j-IEEE-TRANS-PAR-DIST-SYS, volume = "10", number = "9", pages = "922--??", month = sep, year = "1999", CODEN = "ITDSEO", ISSN = "1045-9219 (print), 1558-2183 (electronic)", ISSN-L = "1045-9219", bibdate = "Thu Oct 12 18:48:31 MDT 2000", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://dlib.computer.org/td/books/td1999/pdf/l0922.pdf; http://www.computer.org/tpds/td1999/l0922abs.htm", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Parallel and Distributed Systems", journal-URL = "http://www.computer.org/tpds/archives.htm", } @Article{Lo:1999:TCO, author = "Jack L. Lo and Susan J. Eggers and Henry M. Levy and Sujay S. Parekh and Dean M. Tullsen", title = "Tuning Compiler Optimizations for Simultaneous Multithreading", journal = j-INT-J-PARALLEL-PROG, volume = "27", number = "6", pages = "477--503", month = dec, year = "1999", CODEN = "IJPPE5", DOI = "https://doi.org/10.1023/A:1018780200739", ISSN = "0885-7458 (print), 1573-7640 (electronic)", ISSN-L = "0885-7458", bibdate = "Wed Jul 6 16:39:54 MDT 2005", bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=27&issue=6; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; OCLC Contents1st database", URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=27&issue=6&spage=477", acknowledgement = ack-nhfb, fjournal = "International Journal of Parallel Programming", journal-URL = "http://link.springer.com/journal/10766", remark = "Special Issue: {30th Annual ACM\slash IEEE International Symposium on Microarchitecture}, Part {II}.", } @Article{Lundberg:1999:PBS, author = "Lars Lundberg", title = "Predicting and Bounding the Speedup of Multithreaded {Solaris} Programs", journal = j-J-PAR-DIST-COMP, volume = "57", number = "3", 
pages = "322--333", month = jun, year = "1999", CODEN = "JPDCER", DOI = "https://doi.org/10.1006/jpdc.1999.1536", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Thu Mar 9 09:19:07 MST 2000", bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1999.1536/production; http://www.idealibrary.com/links/doi/10.1006/jpdc.1999.1536/production/pdf; http://www.idealibrary.com/links/doi/10.1006/jpdc.1999.1536/production/ref", acknowledgement = ack-nhfb, fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", } @Article{Manley:1999:IPT, author = "Kevin Manley", title = "Improving Performance with Thread-Private Heaps", journal = j-CCCUJ, volume = "17", number = "9", pages = "50--??", month = sep, year = "1999", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:22 MDT 2002", bibsource = "http://www.cuj.com/articles/1999/9909/9909toc.htm?topic=articles; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Threads interact in the darndest ways, but conflicts with a common heap are particularly pernicious. Luckily they can be avoided.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{Marcuello:1999:EST, author = "P. Marcuello and A. 
Gonzalez", title = "Exploiting Speculative Thread-Level Parallelism on a {SMT} Processor", journal = j-LECT-NOTES-COMP-SCI, volume = "1593", pages = "754--??", year = "1999", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Mon Sep 13 16:57:02 MDT 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/lncs1999a.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Masney:1999:IMT, author = "Brian Masney", title = "Introduction to Multi-Threaded Programming", journal = j-LINUX-J, volume = "61", pages = "??--??", month = may, year = "1999", CODEN = "LIJOFX", ISSN = "1075-3583 (print), 1938-3827 (electronic)", ISSN-L = "1075-3583", bibdate = "Thu Jun 3 06:34:02 MDT 1999", bibsource = "http://www.linuxjournal.com/issue61/index.html; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "A description of thread programming basics.", acknowledgement = ack-nhfb, fjournal = "Linux journal", journal-URL = "http://portal.acm.org/citation.cfm?id=J508", } @Article{Mendelson:1999:DAM, author = "Avi Mendelson and Michael Bekerman", title = "Design Alternatives of Multithreaded Architecture", journal = j-INT-J-PARALLEL-PROG, volume = "27", number = "3", pages = "161--193", month = jun, year = "1999", CODEN = "IJPPE5", DOI = "https://doi.org/10.1023/A:1018733528538", ISSN = "0885-7458 (print), 1573-7640 (electronic)", ISSN-L = "0885-7458", bibdate = "Wed Jul 6 16:39:53 MDT 2005", bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=27&issue=3; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; OCLC Contents1st database", URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=27&issue=3&spage=161", acknowledgement = ack-nhfb, fjournal = "International Journal of Parallel Programming", journal-URL = "http://link.springer.com/journal/10766", } 
@InProceedings{Mitchell:1999:ILP, author = "Nicholas Mitchell and Larry Carter and Jeanne Ferrante and Dean Tullsen", title = "Instruction-level Parallelism vs. Thread-level Parallelism on Simultaneous Multi-threading Processors", crossref = "ACM:1999:SPO", pages = "??--??", year = "1999", bibdate = "Thu Feb 24 09:02:57 2000", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sc99.org/techpapers/", acknowledgement = ack-nhfb, } @Article{Moody:1999:STT, author = "Scott Arthur Moody and Samuel Kwok and Dale Karr", title = "{SimpleGraphics}: {Tcl\slash Tk} visualization of real-time multi-threaded and distributed applications", journal = j-SIGADA-LETTERS, volume = "19", number = "2", pages = "60--66", month = jun, year = "1999", CODEN = "AALEE5", ISSN = "1094-3641 (print), 1557-9476 (electronic)", ISSN-L = "1094-3641", bibdate = "Sat Aug 9 09:06:06 MDT 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGAda Ada Letters", } @Article{Narlikar:1999:SES, author = "Girija J. Narlikar and Guy E. Blelloch", title = "Space-Efficient Scheduling of Nested Parallelism", journal = j-TOPLAS, volume = "21", number = "1", pages = "138--173", month = jan, year = "1999", CODEN = "ATPSDT", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Tue Sep 26 10:12:58 MDT 2000", bibsource = "http://www.acm.org/pubs/contents/journals/toplas/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org/pubs/citations/journals/toplas/1999-21-1/p138-narlikar/", abstract = "Many of today's high-level parallel languages support dynamic, fine-grained parallelism. These languages allow the user to expose all the parallelism in the program, which is typically of a much higher degree than the number of processors. Hence an efficient scheduling algorithm is required to assign computations to processors at runtime. 
Besides having low overheads and good load balancing, it is important for the scheduling algorithm to minimize the space usage of the parallel program. This article presents an on-line scheduling algorithm that is provably space efficient and time efficient for nested-parallel languages. For a computation with depth $D$ and serial space requirement $ S_1 $, the algorithm generates a schedule that requires at most $ S_1 + O(K \cdot D \cdot p) $ space (including scheduler space) on $p$ processors. Here, $K$ is a user-adjustable runtime parameter specifying the net amount of memory that a thread may allocate before it is preempted by the scheduler. Adjusting the value of $K$ provides a trade-off between the running time and the memory requirement of a parallel computation. To allow the scheduler to scale with the number of processors we also parallelize the scheduler and analyze the space and time bounds of the computation to include scheduling costs. In addition to showing that the scheduling algorithm is space and time efficient in theory, we demonstrate that it is effective in practice. We have implemented a runtime system that uses our algorithm to schedule lightweight parallel threads. 
The results of executing parallel programs on this system show that our scheduling algorithm significantly reduces memory usage compared to previous techniques, without compromising performance.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Programming Languages and Systems", generalterms = "Algorithms; Languages; Performance", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783", keywords = "dynamic scheduling; multithreading; nested parallelism; parallel language implementation; space efficiency", subject = "Software --- Programming Techniques --- Concurrent Programming (D.1.3): {\bf Parallel programming}; Software --- Programming Languages --- Processors (D.3.4): {\bf Run-time environments}; Theory of Computation --- Analysis of Algorithms and Problem Complexity --- General (F.2.0)", } @Article{Nemeth:1999:MLK, author = "Z. Nemeth and H. Tomiyasu and P. Kacsuk and M. Amamiya", title = "Multithreaded {LOGFLOW} on {KUMP\slash D}", journal = j-LECT-NOTES-COMP-SCI, volume = "1615", pages = "320--??", year = "1999", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Mon Sep 13 16:57:02 MDT 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/lncs1999b.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Nevison:1999:SSC, author = "Christopher H. 
Nevison", title = "Seminar: safe concurrent programming in {Java} with {CSP}", journal = j-SIGCSE, volume = "31", number = "1", pages = "367", month = mar, year = "1999", CODEN = "SIGSD3", DOI = "https://doi.org/10.1145/384266.299817", ISSN = "0097-8418 (print), 2331-3927 (electronic)", ISSN-L = "0097-8418", bibdate = "Sat Nov 17 16:56:36 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigcse1990.bib", abstract = "We present methods for safe and correct programming for concurrent threads in Java. The methods are based on the principles of Concurrent Sequential Processes (CSP). We demonstrate the use of tools which provide the structure of CSP within Java to avoid some of the pitfalls of multithreaded programming using monitors, the primitive synchronization tool in Java. Several examples illustrate the use of these tools.", acknowledgement = ack-nhfb, fjournal = "SIGCSE Bulletin (ACM Special Interest Group on Computer Science Education)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J688", } @Book{Oaks:1999:JT, author = "Scott Oaks and Henry Wong", title = "{Java} threads", publisher = pub-ORA, address = pub-ORA:adr, edition = "Second", pages = "xiii + 319", year = "1999", ISBN = "1-56592-418-5", ISBN-13 = "978-1-56592-418-5", LCCN = "QA76.73.J38 O25 1999", bibdate = "Fri May 10 12:18:17 MDT 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", series = "Java series", acknowledgement = ack-nhfb, keywords = "Java (computer program language); threads (computer programs)", } @Article{Pant:1999:TCP, author = "Lalit Pant", title = "Thread Communication In Parallel Algorithms: Enabling efficient interaction between threads", journal = j-DDJ, volume = "24", number = "4", pages = "32, 34, 36, 38--39", month = apr, year = "1999", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Wed Mar 3 06:30:11 MST 1999", bibsource = 
"https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.ddj.com/ftp/1999/1999_04/parallel.txt", abstract = "With the increasing availability of multiprocessing hardware, thread-based parallel algorithms are becoming more and more important. Lalit presents thread communication mechanisms for use within parallel algorithms. Additional resources include parallel.txt (listings).", acknowledgement = ack-nhfb, fjournal = "Dr. Dobb's Journal of Software Tools", } @Book{Pham:1999:MPW, author = "Thuan Q. Pham and Pankaj K. Garg", title = "Multithreaded Programming with {Win32}", publisher = pub-PHPTR, address = pub-PHPTR:adr, pages = "xix + 219", year = "1999", ISBN = "0-13-010912-6", ISBN-13 = "978-0-13-010912-5", LCCN = "QA76.642.P518 1998", bibdate = "Thu Jan 21 18:58:23 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "Includes CD-ROM.", URL = "http://www.phptr.com/ptrbooks/ptr_0130109126.html", acknowledgement = ack-nhfb, publishersnote = "If you want to deliver NT applications with maximum performance, efficiency and robustness, you need to master multithreading. Multithreaded Programming with Win32 brings together every Win32 multithreading technique and concept you must know --- all brilliantly explained with practical examples and sample code.", xxnote = "Check pages and year??", } @Article{Plauger:1999:SCCg, author = "P. J. 
Plauger", title = "{Standard C/C++}: a Better Red-Black Tree", journal = j-CCCUJ, volume = "17", number = "7", pages = "10--??", month = jul, year = "1999", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:21 MDT 2002", bibsource = "http://www.cuj.com/articles/1999/9907/9907toc.htm?topic=articles; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The C++ Standard is silent about issues such as thread safety and DLL safety, but customers and reviewers certainly aren't.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{Richards:1999:ALT, author = "Etienne Richards", title = "Adding Level-2 Thread Safety to Existing Objects", journal = j-CCCUJ, volume = "17", number = "2", pages = "??--??", month = feb, year = "1999", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:19 MDT 2002", bibsource = "http://www.cuj.com/articles/1999/9902/9902toc.htm?topic=articles; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The code required to share an object among multiple threads is tedious and error prone. But it can be neatly encapsulated.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{Ringle:1999:SCT, author = "Jonathan Ringle", title = "Singleton Creation the Thread-safe Way", journal = j-CCCUJ, volume = "17", number = "10", pages = "??--??", month = oct, year = "1999", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:23 MDT 2002", bibsource = "http://www.cuj.com/articles/1999/9910/9910toc.htm?topic=articles; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Singletons avoid problems with order of construction, at the cost of more problems for multithreading.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{Rodgers:1999:TSN, author = "Jeremy B. Rodgers and Rhonda Kay Gaede and Jeffrey H. 
Kulick", title = "{IN-Tune}: an {In-Situ} non-invasive performance tuning tool for multi-threaded {Linux} on symmetric multiprocessing {Pentium} workstations", journal = j-SPE, volume = "29", number = "9", pages = "775--792", day = "25", month = jul, year = "1999", CODEN = "SPEXBL", ISSN = "0038-0644 (print), 1097-024X (electronic)", ISSN-L = "0038-0644", bibdate = "Thu Jul 29 15:12:27 MDT 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www3.interscience.wiley.com/journalfinder.html", URL = "http://www3.interscience.wiley.com/cgi-bin/abstract?ID=62501865; http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=62501865&PLACEBO=IE.pdf", acknowledgement = ack-nhfb, fjournal = "Software---Practice and Experience", journal-URL = "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1097-024X", } @TechReport{Roe:1999:PMI, author = "Kevin Roe and Piyush Mehrotra", title = "Parallelization of a multigrid incompressible viscous cavity flow solver using {OpenMP}", type = "{NASA} contractor report", number = "NASA\slash CR-1999-209551", institution = inst-NLRC, address = inst-NLRC:adr, pages = "????", year = "1999", bibdate = "Thu Mar 16 07:20:02 2000", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "Also ICASE report 99-36.", acknowledgement = ack-nhfb, } @Article{Ronsse:1999:RFI, author = "Michiel Ronsse and Koen {De Bosschere}", title = "{RecPlay}: a fully integrated practical record\slash replay system", journal = j-TOCS, volume = "17", number = "2", pages = "133--152", month = may, year = "1999", CODEN = "ACSYEC", ISSN = "0734-2071 (print), 1557-7333 (electronic)", ISSN-L = "0734-2071", bibdate = "Tue Sep 26 07:54:31 MDT 2000", bibsource = "http://www.acm.org/pubs/contents/journals/tocs/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org/pubs/citations/journals/tocs/1999-17-2/p133-ronsse/", abstract = "This article presents a practical solution for the cyclic debugging 
of nondeterministic parallel programs. The solution consists of a combination of record\slash replay with automatic on-the-fly data race detection. This combination enables us to limit the record phase to the more efficient recording of the synchronization operations, while deferring the time-consuming data race detection to the replay phase. As the record phase is highly efficient, there is no need to switch it off, hereby eliminating the possibility of Heisenbugs because tracing can be left on all the time. This article describes an implementation of the tools needed to support RecPlay.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Computer Systems", generalterms = "Algorithms; Experimentation; Reliability", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J774", keywords = "binary code modification; multithreaded programming; race detection", subject = "Software --- Programming Techniques --- Concurrent Programming (D.1.3): {\bf Parallel programming}; Software --- Software Engineering --- Testing and Debugging (D.2.5): {\bf Debugging aids}; Software --- Software Engineering --- Testing and Debugging (D.2.5): {\bf Monitors}; Software --- Software Engineering --- Testing and Debugging (D.2.5): {\bf Tracing}; Software --- Operating Systems --- Process Management (D.4.1): {\bf Concurrency}; Software --- Operating Systems --- Process Management (D.4.1): {\bf Deadlocks}; Software --- Operating Systems --- Process Management (D.4.1): {\bf Multiprocessing/multiprogramming/multitasking}; Software --- Operating Systems --- Process Management (D.4.1): {\bf Mutual exclusion}; Software --- Operating Systems --- Process Management (D.4.1): {\bf Synchronization}", } @Article{Rugina:1999:PAM, author = "Radu Rugina and Martin Rinard", title = "Pointer Analysis for Multithreaded Programs", journal = j-SIGPLAN, volume = "34", number = "5", pages = "77--90", month = may, year = "1999", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 
(electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:18:03 MST 2003", bibsource = "http://www.acm.org/pubs/contents/proceedings/pldi/301122/index.html; http://www.acm.org/pubs/contents/proceedings/pldi/301618/index.html; http://www.cs.rutgers.edu/pldi99/program.html; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "See PLDI'99 proceedings \cite{ACM:1999:PASa}.", URL = "http://www.acm.org:80/pubs/citations/proceedings/pldi/301122/p77-rugina/", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Saito:1999:MRS, author = "H. Saito and N. Stavrakos and C. Polychronopoulos", title = "Multithreading Runtime Support for Loop and Functional Parallelism", journal = j-LECT-NOTES-COMP-SCI, volume = "1615", pages = "133--??", year = "1999", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Mon Sep 13 16:57:02 MDT 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/lncs1999b.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @MastersThesis{Samorodin:1999:SFS, author = "Steven Howard Samorodin", title = "Supporting flexible safety and sharing in multi-threaded environments", type = "Thesis ({M.S.})", school = "Computer Science Department, University of California, Davis", address = "Davis, CA, USA", pages = "39", year = "1999", bibdate = "Sat Apr 20 11:17:26 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Scherer:1999:TAP, author = "Alex Scherer and Honghui Lu and Thomas Gross and Willy Zwaenepoel", title = "Transparent adaptive parallelism on {NOWs} using {OpenMP}", journal = j-SIGPLAN, volume = "34", number = "8", pages = "96--106", month = aug, year = "1999", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", 
ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:18:06 MST 2003", bibsource = "http://www.acm.org/pubs/contents/proceedings/ppopp/301104/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org/pubs/citations/proceedings/ppopp/301104/p96-scherer/", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Seiden:1999:ROM, author = "Steven S. Seiden", title = "Randomized Online Multi-Threaded Paging", journal = j-NORDIC-J-COMPUT, volume = "6", number = "2", pages = "148--??", month = "Summer", year = "1999", CODEN = "NJCOFR", ISSN = "1236-6064", bibdate = "Fri Oct 13 05:25:14 MDT 2000", bibsource = "http://www.cs.helsinki.fi/njc/njc6.html; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.cs.helsinki.fi/njc/References/seiden1999:148.html", acknowledgement = ack-nhfb, fjournal = "Nordic Journal of Computing", } @InProceedings{Shen:1999:ATL, author = "Kai Shen and Hong Tang and Tao Yang", title = "Adaptive Two-level Thread Management for Fast {MPI} Execution on Shared Memory Machines", crossref = "ACM:1999:SPO", pages = "??--??", year = "1999", bibdate = "Thu Feb 24 09:02:57 2000", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sc99.org/techpapers/", acknowledgement = ack-nhfb, } @Article{Sinharoy:1999:COI, author = "Balaram Sinharoy", title = "Compiler optimization to improve data locality for processor multithreading", journal = j-SCI-PROG, volume = "7", number = "1", pages = "21--37", month = "????", year = "1999", CODEN = "SCIPEV", ISSN = "1058-9244 (print), 1875-919X (electronic)", ISSN-L = "1058-9244", bibdate = "Thu Mar 28 12:27:27 MST 2002", bibsource = "Compendex database; http://www.iospress.nl/site/html/10589244.html; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; OCLC Article1st database", URL = 
"http://iospress.metapress.com/app/home/contribution.asp%3Fwasp=64cr5a4mg33tuhcbdr02%26referrer=parent%26backto=issue%2C2%2C7%3Bjournal%2C8%2C9%3Blinkingpublicationresults%2C1%2C1", acknowledgement = ack-nhfb, fjournal = "Scientific Programming", journal-URL = "http://iospress.metapress.com/content/1058-9244", } @InProceedings{Storino:1999:MTB, author = "Salvatore Storino and John M. Borkenhagen and Ronald N. Kalla and Steven R. Kunkel", title = "A Multi-Threaded 64-bit {PowerPC} Commercial {RISC} Processor Design", crossref = "IEEE:1999:HCS", pages = "??--??", year = "1999", bibdate = "Mon Jan 08 05:28:04 2001", bibsource = "ftp://www.hotchips.org//pub/hotc7to11cd/hc99/hc11_pdf/hc99.s1.1.Storino.txt; http://www.hotchips.org/hotc11_monday.html; https://www.math.utah.edu/pub/tex/bib/hot-chips.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Sutter:1999:OAM, author = "Herb Sutter", title = "Optimizations That Aren't (In a Multithreaded World)", journal = j-CCCUJ, volume = "17", number = "6", pages = "??--??", month = jun, year = "1999", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:21 MDT 2002", bibsource = "http://www.cuj.com/articles/1999/9906/9906toc.htm?topic=articles; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "An ``obvious'' optimization can really lose ground when thread safety has to be ensured as well.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @InProceedings{Tan:1999:OFN, author = "Kian-Lee Tan and Cheng Hian Goh and Beng Chin Ooi", title = "Online Feedback for Nested Aggregate Queries with Multi-Threading", crossref = "Atkinson:1999:PTF", pages = "18--29", year = "1999", bibdate = "Fri Jan 12 07:50:37 MST 2001", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/vldb.bib; http://www.vldb.org/dblp/db/conf/vldb/vldb99.html; OCLC Proceedings database", URL = 
"http://www.vldb.org/dblp/db/conf/vldb/TanGO99.html", acknowledgement = ack-nhfb, authorurl = "http://www.vldb.org/dblp/db/indices/a-tree/t/Tan:Kian=Lee.html; http://www.vldb.org/dblp/db/indices/a-tree/g/Goh:Cheng_Hian.html; http://www.vldb.org/dblp/db/indices/a-tree/o/Ooi:Beng_Chin.html", } @Article{Tang:1999:APT, author = "Xinan Tang and Guang R. Gao", title = "Automatically Partitioning Threads for Multithreaded Architectures", journal = j-J-PAR-DIST-COMP, volume = "58", number = "2", pages = "159--189", month = aug, year = "1999", CODEN = "JPDCER", DOI = "https://doi.org/10.1006/jpdc.1999.1551", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Thu Mar 9 09:19:08 MST 2000", bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1999.1551/production; http://www.idealibrary.com/links/doi/10.1006/jpdc.1999.1551/production/pdf; http://www.idealibrary.com/links/doi/10.1006/jpdc.1999.1551/production/ref", acknowledgement = ack-nhfb, fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", } @Article{Tang:1999:CRT, author = "Hong Tang and Kai Shen and Tao Yang", title = "Compile\slash run-time support for threaded {MPI} execution on multiprogrammed shared memory machines", journal = j-SIGPLAN, volume = "34", number = "8", pages = "107--118", month = aug, year = "1999", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:18:06 MST 2003", bibsource = "http://www.acm.org/pubs/contents/proceedings/ppopp/301104/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org/pubs/citations/proceedings/ppopp/301104/p107-tang/", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = 
"http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Taura:1999:SMI, author = "Kenjiro Taura and Kunio Tabata and Akinori Yonezawa", title = "{StackThreads\slash MP}: integrating futures into calling standards", journal = j-SIGPLAN, volume = "34", number = "8", pages = "60--71", month = aug, year = "1999", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:18:06 MST 2003", bibsource = "http://www.acm.org/pubs/contents/proceedings/ppopp/301104/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org/pubs/citations/proceedings/ppopp/301104/p60-taura/", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Throop:1999:SOS, author = "Joe Throop", title = "Standards: {OpenMP}: Shared-Memory Parallelism from the Ashes", journal = j-COMPUTER, volume = "32", number = "5", pages = "108--109", month = may, year = "1999", CODEN = "CPTRB4", ISSN = "0018-9162 (print), 1558-0814 (electronic)", ISSN-L = "0018-9162", bibdate = "Thu May 6 06:17:23 MDT 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://dlib.computer.org/co/books/co1999/pdf/r5108.pdf", acknowledgement = ack-nhfb, fjournal = "Computer", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2", } @Article{Torrant:1999:SMS, author = "Marc Torrant and Muhammad Shaaban and Roy Czernikowski and Ken Hsu", title = "A simultaneous multithreading simulator", journal = j-COMP-ARCH-NEWS, volume = "27", number = "5", pages = "1--5", month = dec, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = 
"http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Vlassov:1999:QMM, author = "V. Vlassov and A. Kraynikov", title = "A Queuing Model of a Multi-threaded Architecture: a Case Study", journal = j-LECT-NOTES-COMP-SCI, volume = "1662", pages = "306--??", year = "1999", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Mon Sep 13 16:57:02 MDT 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/lncs1999b.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Weissman:1999:HPT, author = "B. Weissman and B. Gomes", title = "High Performance Thread Migration on Clusters of {SMPs}", journal = j-PARALLEL-DIST-COMP-PRACT, volume = "2", number = "2", pages = "??--??", month = "????", year = "1999", CODEN = "????", ISSN = "1097-2803", bibdate = "Fri Dec 19 08:14:13 MST 2003", bibsource = "http://www.cs.okstate.edu/~pdcp/vols/vol02/vol02no2.html; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.cs.okstate.edu/~pdcp/vols/vol02/vol02no2abs.html#boris", acknowledgement = ack-nhfb, fjournal = "PDCP: Parallel and Distributed Computing Practices", } @Article{Wu:1999:GMC, author = "C.-C. Wu and C. Chen", title = "Grouping Memory Consistency Model for Parallel-Multithreaded Shared-Memory Multiprocessor Systems", journal = j-INT-J-HIGH-SPEED-COMPUTING, volume = "10", number = "1", pages = "53--82", month = mar, year = "1999", CODEN = "IHSCEZ", ISSN = "0129-0533", bibdate = "Mon Feb 25 11:19:21 MST 2002", bibsource = "http://ejournals.wspc.com.sg/ijhsc/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; OCLC Article1st database", acknowledgement = ack-nhfb, fjournal = "International Journal of High Speed Computing (IJHSC)", } @Article{Xu:1999:DIT, author = "Zhichen Xu and Barton P. 
Miller and Oscar Naim", title = "Dynamic instrumentation of threaded applications", journal = j-SIGPLAN, volume = "34", number = "8", pages = "49--59", month = aug, year = "1999", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:18:06 MST 2003", bibsource = "http://www.acm.org/pubs/contents/proceedings/ppopp/301104/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org/pubs/citations/proceedings/ppopp/301104/p49-xu/", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @TechReport{Presotto:19xx:MSP, author = "David Leo Presotto", title = "Multiprocessor Streams for {Plan 9}", type = "Computing Science Technical Report", number = "158e", institution = inst-ATT-BELL, address = inst-ATT-BELL:adr, pages = "10", day = "??", month = "????", year = "19xx", bibdate = "Fri Aug 25 15:53:20 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/plan9.bib; https://www.math.utah.edu/pub/tex/bib/unix.bib", abstract = "This paper describes an implementation of Streams for the Plan 9 kernel, a multi-threaded, multiprocessor kernel with a system call interface reminiscent of UNIX. Rather than port Dennis Ritchie's Streams to Plan 9, we changed the abstraction to fit more naturally into the new environment. The result is a mechanism that has similar performance and is internally easier to program.", acknowledgement = ack-nhfb, remark = "Undated and unnumbered. 
Number taken from filename.", } @Article{Akkary:2000:CSM, author = "Haitham Akkary and S{\'e}bastien Hily", title = "The Case for Speculative Multithreading on {SMT} Processors", journal = j-LECT-NOTES-COMP-SCI, volume = "1940", pages = "59--??", year = "2000", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Fri Feb 1 09:17:15 MST 2002", bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t1940.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer-ny.com/link/service/series/0558/bibs/1940/19400059.htm; http://link.springer-ny.com/link/service/series/0558/papers/1940/19400059.pdf", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Anonymous:2000:NPAa, author = "Anonymous", title = "New Products: {AVP for Linux/FreeBSD UNIX, Kaspersky Lab Ltd.; API PowerRAC Chassis 320, Alpha Processor Inc.; ODBC-ODBC Bridge, Easysoft Ltd.; LinkScan 6.1, Electronic Software Publishing Corporation; Metro-X Enhanced Server CD, Metro Link, Inc.; P-STAT Statistical Software, P-STAT, Inc.; System Manager in a Box v1.0, PegaSoft Canada; PGI Workstation 3.1, PGI; Quick Restore 2.6, Workstation Solutions, Inc.; Threads.h++ and Tools.h++ Professional, Rogue Wave Software; Scriptics Connect 1.0, 1.1, Scriptics Corporation; TapeWare 6.2 Backup Software, Yosemite Technologies, Inc.; DoubleVision for Linux Systems, Tridia Corporation}", journal = j-LINUX-J, volume = "71", pages = "??--??", month = mar, year = "2000", CODEN = "LIJOFX", ISSN = "1075-3583 (print), 1938-3827 (electronic)", ISSN-L = "1075-3583", bibdate = "Thu Sep 21 07:44:12 MDT 2000", bibsource = "http://noframes.linuxjournal.com/lj-issues/issue71/index.html; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Linux journal", journal-URL = "http://portal.acm.org/citation.cfm?id=J508", } @Article{Anonymous:2000:SLT, author = "Anonymous", 
title = "Strictly On-Line: {T/TCP: TCP for Transactions by Mark Stacey, Ivan Griffin and John Nelson; POSIX Thread Libraries by Felix Garcia and Javier Fernandez; Linux and Open-Source Applications by Peter Jones and M. B. Jorgenson; Laptops for Linux! by Jason Kroll}", journal = j-LINUX-J, volume = "70", pages = "??--??", month = feb, year = "2000", CODEN = "LIJOFX", ISSN = "1075-3583 (print), 1938-3827 (electronic)", ISSN-L = "1075-3583", bibdate = "Thu Sep 21 16:32:31 MDT 2000", bibsource = "http://noframes.linuxjournal.com/lj-issues/issue70/index.html; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://noframes.linuxjournal.com/lj-issues/issue70/3075.html; http://noframes.linuxjournal.com/lj-issues/issue70/3184.html; http://noframes.linuxjournal.com/lj-issues/issue70/3683.html; http://noframes.linuxjournal.com/lj-issues/issue70/3766.html", acknowledgement = ack-nhfb, fjournal = "Linux journal", journal-URL = "http://portal.acm.org/citation.cfm?id=J508", } @Article{Antoniu:2000:CDP, author = "G. Antoniu and L. Boug{\'e} and R. Namyst and C. 
P{\'e}rez", title = "Compiling Data-Parallel Programs to a Distributed Runtime Environment with Thread Isomigration", journal = j-PARALLEL-PROCESS-LETT, volume = "10", number = "2/3", pages = "201--??", month = sep, year = "2000", CODEN = "PPLTEE", ISSN = "0129-6264 (print), 1793-642X (electronic)", ISSN-L = "0129-6264", bibdate = "Wed Apr 18 07:29:37 2001", bibsource = "http://ejournals.wspc.com.sg/ppl/10/1002_03/S01296264001002_03.html; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://ejournals.wspc.com.sg/ppl/10/1002_03/S0129626400000202.html", acknowledgement = ack-nhfb, fjournal = "Parallel Processing Letters", journal-URL = "http://www.worldscientific.com/loi/ppl", } @Article{Antoniu:2000:IJC, author = "Gabriel Antoniu and Luc Boug{\'e} and Philip Hatcher and Mark MacBeth and Keith McGuigan and Raymond Namyst", title = "Implementing {Java} Consistency Using a Generic, Multithreaded {DSM} Runtime System", journal = j-LECT-NOTES-COMP-SCI, volume = "1800", pages = "560--??", year = "2000", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Fri Feb 1 09:16:18 MST 2002", bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t1800.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer-ny.com/link/service/series/0558/bibs/1800/18000560.htm; http://link.springer-ny.com/link/service/series/0558/papers/1800/18000560.pdf", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Aumage:2000:PAM, author = "Olivier Aumage and Luc Boug{\'e} and Raymond Namyst", title = "A Portable and Adaptative Multi-protocol Communication Library for Multithreaded Runtime Systems", journal = j-LECT-NOTES-COMP-SCI, volume = "1800", pages = "1136--??", year = "2000", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Fri Feb 1 09:16:18 MST 2002", bibsource = 
"http://link.springer-ny.com/link/service/series/0558/tocs/t1800.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer-ny.com/link/service/series/0558/bibs/1800/18001136.htm; http://link.springer-ny.com/link/service/series/0558/papers/1800/18001136.pdf", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Becker:2000:JSU, author = "Pete Becker", title = "The Journeyman's Shop: Unraveling Multithreading", journal = j-CCCUJ, volume = "18", number = "8", pages = "71--??", month = aug, year = "2000", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:27 MDT 2002", bibsource = "http://www.cuj.com/articles/2000/0008/0008toc.htm?topic=articles; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Sometimes you have to spend a lot of time on just a little bit of code, to avoid spending much more time not knowing where to begin debugging.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{Bedy:2000:VSM, author = "Michael Bedy and Steve Carr and Xianlong Huang and Ching-Kuang Shene", title = "A visualization system for multithreaded programming", journal = j-SIGCSE, volume = "32", number = "1", pages = "1--5", month = mar, year = "2000", CODEN = "SIGSD3", DOI = "https://doi.org/10.1145/331795.331798", ISSN = "0097-8418 (print), 2331-3927 (electronic)", ISSN-L = "0097-8418", bibdate = "Mon Nov 19 10:05:03 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigcse2000.bib", acknowledgement = ack-nhfb, fjournal = "SIGCSE Bulletin (ACM Special Interest Group on Computer Science Education)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J688", } @Article{Berger:2000:HSMa, author = "Emery D. Berger and Kathryn S. McKinley and Robert D. Blumofe and Paul R. 
Wilson", title = "{Hoard}: a scalable memory allocator for multithreaded applications", journal = j-COMP-ARCH-NEWS, volume = "28", number = "5", pages = "117--128", month = dec, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Berger:2000:HSMb, author = "Emery D. Berger and Kathryn S. McKinley and Robert D. Blumofe and Paul R. Wilson", title = "{Hoard}: a Scalable Memory Allocator for Multithreaded Applications", journal = j-SIGPLAN, volume = "35", number = "11", pages = "117--128", month = nov, year = "2000", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:18:19 MST 2003", bibsource = "http://foothill.lcs.mit.edu/asplos2k/program.html; http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Berger:2000:HSMc, author = "Emery D. Berger and Kathryn S. McKinley and Robert D. Blumofe and Paul R. Wilson", title = "{Hoard}: a scalable memory allocator for multithreaded applications", journal = j-OPER-SYS-REV, volume = "34", number = "5", pages = "117--128", month = dec, year = "2000", CODEN = "OSRED8", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:56 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGOPS Operating Systems Review", } @Article{Beyls:2000:CGM, author = "K. E. Beyls and E. H. 
D'Hollander", title = "Compiler Generated Multithreading to Alleviate Memory Latency", journal = j-J-UCS, volume = "6", number = "10", pages = "968--993", day = "28", month = oct, year = "2000", CODEN = "????", ISSN = "0948-695X (print), 0948-6968 (electronic)", ISSN-L = "0948-6968", bibdate = "Wed Feb 20 07:23:07 MST 2002", bibsource = "http://www.jucs.org/jucs; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.jucs.org/jucs_6_10/compiler_generated_multithreading_to", acknowledgement = ack-nhfb, fjournal = "J.UCS: Journal of Universal Computer Science", journal-URL = "http://www.jucs.org/jucs", } @Article{Bhandarkar:2000:PPM, author = "Suchendra M. Bhandarkar and Shankar R. Chandrasekaran", title = "Parallel Parsing of {MPEG} Video in a Multi-threaded Multiprocessor Environment", journal = j-LECT-NOTES-COMP-SCI, volume = "1800", pages = "194--??", year = "2000", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Fri Feb 1 09:16:18 MST 2002", bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t1800.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer-ny.com/link/service/series/0558/bibs/1800/18000194.htm; http://link.springer-ny.com/link/service/series/0558/papers/1800/18000194.pdf", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Bolding:2000:MSM, author = "Barry Bolding and Kim Baldridge", title = "Multithreaded shared memory parallel implementation of the electronic structure code {GAMESS}", journal = j-COMP-PHYS-COMM, volume = "128", number = "1--2", pages = "55--66", day = "9", month = jun, year = "2000", CODEN = "CPHCBZ", DOI = "https://doi.org/10.1016/S0010-4655(00)00067-9", ISSN = "0010-4655 (print), 1879-2944 (electronic)", ISSN-L = "0010-4655", bibdate = "Mon Feb 13 23:40:43 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/compphyscomm2000.bib; 
https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.sciencedirect.com/science/article/pii/S0010465500000679", acknowledgement = ack-nhfb, fjournal = "Computer Physics Communications", journal-URL = "http://www.sciencedirect.com/science/journal/00104655", } @Article{Borkenhagen:2000:MPP, author = "J. M. Borkenhagen and R. J. Eickemeyer and R. N. Kalla and S. R. Kunkel", title = "A multithreaded {PowerPC} processor for commercial servers", journal = j-IBM-JRD, volume = "44", number = "6", pages = "885--898", month = nov, year = "2000", CODEN = "IBMJAE", ISSN = "0018-8646 (print), 2151-8556 (electronic)", ISSN-L = "0018-8646", bibdate = "Sat Feb 24 09:44:45 MST 2001", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.research.ibm.com/journal/", URL = "http://www.research.ibm.com/journal/rd/446/borkenhagen.html", acknowledgement = ack-nhfb, fjournal = "IBM Journal of Research and Development", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520", ordernumber = "G322-0224", } @Article{Boussinot:2000:JTS, author = "Fr{\'e}d{\'e}ric Boussinot and Jean-Ferdy Susini", title = "{Java} threads and {SugarCubes}", journal = j-SPE, volume = "30", number = "5", pages = "545--566", day = "25", month = apr, year = "2000", CODEN = "SPEXBL", DOI = "https://doi.org/10.1002/(SICI)1097-024X(20000425)30:5<545::AID-SPE308>3.0.CO;2-Q", ISSN = "0038-0644 (print), 1097-024X (electronic)", ISSN-L = "0038-0644", bibdate = "Tue Mar 13 06:45:44 2001", bibsource = "http://www.interscience.wiley.com/jpages/0038-0644; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www3.interscience.wiley.com/journalfinder.html", URL = "http://www3.interscience.wiley.com/cgi-bin/abstract/71004433/START; http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=71004433&PLACEBO=IE.pdf", acknowledgement = ack-nhfb, fjournal = "Software---Practice and Experience", journal-URL = 
"http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1097-024X", } @Article{Bova:2000:DLP, author = "Steve W. Bova and Clay P. Breshears and Christine E. Cuicchi and Zeki Demirbilek and Henry A. Gabb", title = "Dual-Level Parallel Analysis of Harbor Wave Response Using {MPI} and {OpenMP}", journal = j-IJHPCA, volume = "14", number = "1", pages = "49--64", month = "Spring", year = "2000", CODEN = "IHPCFL", ISSN = "1094-3420 (print), 1741-2846 (electronic)", ISSN-L = "1094-3420", bibdate = "Tue Sep 12 12:39:11 2000", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, journal-URL = "http://hpc.sagepub.com/content/by/year", } @InCollection{Cahir:2000:PMM, author = "Margaret Cahir and Robert Moench and Alice E. Koniges", title = "Programming Models and Methods", crossref = "Koniges:2000:ISP", chapter = "3", pages = "27--54", year = "2000", bibdate = "Fri Feb 04 18:32:51 2000", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "Discusses PVM, MPI, SHMEM, High-Performance Fortran, and POSIX threads.", acknowledgement = ack-nhfb, } @Article{Cahoon:2000:EPD, author = "Brendon Cahoon and Kathryn S. McKinley and Zhihong Lu", title = "Evaluating the performance of distributed architectures for information retrieval using a variety of workloads", journal = j-TOIS, volume = "18", number = "1", pages = "1--43", month = jan, year = "2000", CODEN = "ATISET", ISSN = "1046-8188", ISSN-L = "0734-2047", bibdate = "Tue Sep 26 09:34:01 MDT 2000", bibsource = "http://www.acm.org/pubs/contents/journals/tois/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org/pubs/citations/journals/tois/2000-18-1/p1-cahoon/", abstract = "The information explosion across the Internet and elsewhere offers access to an increasing number of document collections. 
In order for users to effectively access these collections, information retrieval (IR) systems must provide coordinated, concurrent, and distributed access. In this article, we explore how to achieve scalable performance in a distributed system for collection sizes ranging from 1GB to 128GB. We implement a fully functional distributed IR system based on a multithreaded version of the Inquery simulation model. We measure performance as a function of system parameters such as client command rate, number of document collections, ter ms per query, query term frequency, number of answers returned, and command mixture. Our results show that it is important to model both query and document commands because the heterogeneity of commands significantly impacts performance. Based on our results, we recommend simple changes to the prototype and evaluate the changes using the simulator. Because of the significant resource demands of information retrieval, it is not difficult to generate workloads that overwhelm system resources regardless of the architecture. However under some realistic workloads, we demonstrate system organizations for which response time gracefully degrades as the workload increases and performance scales with the number of processors. 
This scalable architecture includes a surprisingly small number of brokers through which a large number of clients and servers communicate.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Information Systems", keywords = "distributed information retrieval architectures", subject = "Computer Systems Organization --- Computer-Communication Networks --- Distributed Systems (C.2.4); Computer Systems Organization --- Performance of Systems (C.4); Computer Systems Organization --- Performance of Systems (C.4): {\bf Performance attributes}; Information Systems --- Information Storage and Retrieval --- Systems and Software (H.3.4)", } @Article{Calkins:2000:ITT, author = "Charles Calkins", title = "Integrating Threads with Template Classes", journal = j-CCCUJ, volume = "18", number = "5", pages = "32--??", month = may, year = "2000", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:26 MDT 2002", bibsource = "http://www.cuj.com/articles/2000/0005/0005toc.htm?topic=articles; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "It's obviously a good idea to encapsulate a thread as an object. 
It is less obvious how to get all the interfaces right.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{Carr:2000:PCL, author = "Steve Carr and Ching-Kuang Shene", title = "A portable class library for teaching multithreaded programming", journal = j-SIGCSE, volume = "32", number = "3", pages = "124--127", month = sep, year = "2000", CODEN = "SIGSD3", DOI = "https://doi.org/10.1145/353519.343138", ISSN = "0097-8418 (print), 2331-3927 (electronic)", ISSN-L = "0097-8418", bibdate = "Sat Nov 17 16:56:43 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigcse2000.bib", acknowledgement = ack-nhfb, fjournal = "SIGCSE Bulletin (ACM Special Interest Group on Computer Science Education)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J688", } @Article{ChassindeKergommeaux:2000:PIV, author = "J. {Chassin de Kergommeaux} and B. Stein and P. E. Bernard", title = "{Paj{\'e}}, an interactive visualization tool for tuning multi-threaded parallel applications", journal = j-PARALLEL-COMPUTING, volume = "26", number = "10", pages = "1253--1274", day = "15", month = aug, year = "2000", CODEN = "PACOEJ", ISSN = "0167-8191 (print), 1872-7336 (electronic)", ISSN-L = "0167-8191", bibdate = "Sat Oct 28 17:44:14 MDT 2000", bibsource = "http://www.elsevier.com/locate/issn/01678191; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.elsevier.nl/gej-ng/10/35/21/42/31/24/abstract.html; http://www.elsevier.nl/gej-ng/10/35/21/42/31/24/article.pdf", acknowledgement = ack-nhfb, fjournal = "Parallel Computing", journal-URL = "http://www.sciencedirect.com/science/journal/01678191", } @Article{Choi:2000:SCP, author = "Sung-Eun Choi and E. 
Christopher Lewis", title = "A study of common pitfalls in simple multi-threaded programs", journal = j-SIGCSE, volume = "32", number = "1", pages = "325--329", month = mar, year = "2000", CODEN = "SIGSD3", DOI = "https://doi.org/10.1145/331795.331879", ISSN = "0097-8418 (print), 2331-3927 (electronic)", ISSN-L = "0097-8418", bibdate = "Mon Nov 19 10:05:03 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigcse2000.bib", abstract = "It is generally acknowledged that developing correct multi-threaded codes is difficult, because threads may interact with each other in unpredictable ways. The goal of this work is to discover common multi-threaded programming pitfalls, the knowledge of which will be useful in instructing new programmers and in developing tools to aid in multi-threaded programming. To this end, we study multi-threaded applications written by students from introductory operating systems courses. Although the applications are simple, careful inspection and the use of an automatic race detection tool reveal a surprising quantity and variety of synchronization errors. 
We describe and discuss these errors, evaluate the role of automated tools, and propose new tools for use in the instruction of multi-threaded programming.", acknowledgement = ack-nhfb, fjournal = "SIGCSE Bulletin (ACM Special Interest Group on Computer Science Education)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J688", } @Book{Christopher:2000:HPJ, author = "Thomas Christopher and George Thiruvathukal", title = "High Performance {Java} Platform Computing: Multithreaded and Networked Programming", publisher = pub-PH, address = pub-PH:adr, pages = "xxii + 409", year = "2000", ISBN = "0-13-016164-0", ISBN-13 = "978-0-13-016164-2", LCCN = "????", bibdate = "Tue Feb 20 18:03:50 2001", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", price = "US\$49.99", URL = "http://www.sun.com/books/catalog/christopher/", acknowledgement = ack-nhfb, } @Article{Corbett:2000:USA, author = "James C. Corbett", title = "Using shape analysis to reduce finite-state models of concurrent {Java} programs", journal = j-TOSEM, volume = "9", number = "1", pages = "51--93", month = jan, year = "2000", CODEN = "ATSMER", ISSN = "1049-331X (print), 1557-7392 (electronic)", ISSN-L = "1049-331X", bibdate = "Fri Apr 20 08:21:35 MDT 2001", bibsource = "http://www.acm.org/pubs/toc/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org/pubs/articles/journals/tosem/2000-9-1/p51-corbett/p51-corbett.pdf; http://www.acm.org/pubs/citations/journals/tosem/2000-9-1/p51-corbett/", abstract = "Finite-state verification (e.g., model checking) provides a powerful means to detect concurrency errors, which are often subtle and difficult to reproduce. Nevertheless, widespread use of this technology by developers is unlikely until tools provide automated support for extracting the required finite-state models directly from program source. 
Unfortunately, the dynamic features of modern languages such as Java complicate the construction of compact finite-state models for verification. In this article, we show how shape analysis, which has traditionally been used for computing alias information in optimizers, can be used to greatly reduce the size of finite-state models of concurrent Java programs by determining which heap-allocated variables are accessible only by a single thread, and which shared variables are protected by locks. We also provide several other state-space reductions based on the semantics of Java monitors. A prototype of the reductions demonstrates their effectiveness.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Software Engineering and Methodology", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J790", keywords = "concurrent systems; finite-state verification; Java; model extraction; modeling; shape analysis; state-space reductions", subject = "Software --- Software Engineering --- Software/Program Verification (D.2.4)", } @Article{Cui:2000:MPC, author = "J. Cui and J. L. Bordim and K. Nakano and T. Hayashi and N. Ishii", title = "Multithreaded Parallel Computer Model with Performance Evaluation", journal = j-LECT-NOTES-COMP-SCI, volume = "1800", pages = "155--??", year = "2000", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Fri Feb 1 09:16:18 MST 2002", bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t1800.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer-ny.com/link/service/series/0558/bibs/1800/18000155.htm; http://link.springer-ny.com/link/service/series/0558/papers/1800/18000155.pdf", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Danjean:2000:IKA, author = "Vincent Danjean and Raymond Namyst and Robert D. 
title = "Integrating Kernel Activations in a Multithreaded Runtime System on Top of {Linux}",
The model checker supports dynamic allocation, thread creation, and recursive procedures (features that are not necessary for hardware verification), and has some special optimizations and checks tailored to multi-threaded Java program. I will also discuss some of the challenges for future efforts in this area.", acknowledgement = ack-nhfb, fjournal = "ACM SIGSOFT Software Engineering Notes", journal-URL = "https://dl.acm.org/citation.cfm?id=J728", } @Article{Duda:2000:BVT, author = "Kenneth J. Duda and David R. Cheriton", title = "Borrowed-virtual-time {(BVT)} scheduling: supporting latency-sensitive threads in a general-purpose scheduler", journal = j-OPER-SYS-REV, volume = "34", number = "2", pages = "27--28", month = apr, year = "2000", CODEN = "OSRED8", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:42 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @InProceedings{Engelschall:2000:PMS, author = "Ralf S. 
DOI = "https://doi.org/10.1145/356989.357001",
URL = "http://noframes.linuxjournal.com/lj-issues/issue70/3184.html",
Geppert", title = "Microprocessors: the off-beat generation", journal = j-IEEE-SPECTRUM, volume = "37", number = "7", pages = "44--49", month = jul, year = "2000", CODEN = "IEESAM", DOI = "https://doi.org/10.1109/6.852051", ISSN = "0018-9235 (print), 1939-9340 (electronic)", ISSN-L = "0018-9235", bibdate = "Sat Jan 18 12:29:46 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeespectrum2000.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Spectrum", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=6", keywords = "Biology computing; Bonding; Broadband communication; broadband networks; Electronics industry; microprocessor chips; microprocessors; Microprocessors; multimedia broadband communications; multimedia communication; multimedia computing; Multithreading; off-beat generation; performance; Personal communication networks; programmable controllers; programmable logic; Real time systems; Supercomputers; supercomputing; Workstations", } @Article{Gontmakher:2000:JCN, author = "Alex Gontmakher and Assaf Schuster", title = "{Java} consistency: nonoperational characterizations for {Java} memory behavior", journal = j-TOCS, volume = "18", number = "4", pages = "333--386", year = "2000", CODEN = "ACSYEC", ISSN = "0734-2071 (print), 1557-7333 (electronic)", ISSN-L = "0734-2071", bibdate = "Wed Jul 18 10:18:45 MDT 2001", bibsource = "http://www.acm.org/pubs/toc/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org/pubs/articles/journals/tocs/2000-18-4/p333-gontmakher/p333-gontmakher.pdf; http://www.acm.org/pubs/citations/journals/tocs/2000-18-4/p333-gontmakher/", abstract = "The Java Language Specification (JLS) [Gosling et al. 1996] provides an operational definition for the consistency of shared variables. 
The definition remains unchanged in the JLS 2nd edition, currently under peer review, which relies on a specific abstract machine as its underlying model, is very complicated. Several subsequent works have tried to simplify and formalize it. However, these revised definitions are also operational, and thus have failed to highlight the intuition behind the original specification. In this work we provide a complete nonoperational specification for Java and for the JVM, excluding synchronized operations. We provide a simpler definition, in which we clearly distinguish the consistency model that is promised to the programmer from that which should be implemented in the JVM. This distinction, which was implicit in the original definition, is crucial for building the JVM. We find that the programmer model is strictly weaker than that of the JVM, and precisely define their discrepancy. Moreover, our definition is independent of any specific (or even abstract) machine, and can thus be used to verify JVM implementations and compiler optimizations on any platform. Finally, we show the precise range of consistency relaxations obtainable for the Java memory model when a certain compiler optimization-- called {\em prescient stores\/} in JLS--is applicable.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Computer Systems", generalterms = "Verification", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J774", keywords = "Java memory models; multithreading; nonoperational specification", subject = "Hardware --- Memory Structures --- Performance Analysis and Design Aids** (B.3.3): {\bf Formal models**}", } @Article{Gopinath:2000:PSB, author = "K. Gopinath and M. K. 
Krishna Narasimhan", title = "Performance of Switch Blocking on Multithreaded Architectures", journal = j-J-UCS, volume = "6", number = "10", pages = "928--947", day = "28", month = oct, year = "2000", CODEN = "????", ISSN = "0948-695X (print), 0948-6968 (electronic)", ISSN-L = "0948-6968", bibdate = "Wed Feb 20 07:23:07 MST 2002", bibsource = "http://www.jucs.org/jucs; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.jucs.org/jucs_6_10/performance_of_switch_blocking", acknowledgement = ack-nhfb, fjournal = "J.UCS: Journal of Universal Computer Science", journal-URL = "http://www.jucs.org/jucs", } @Book{Holub:2000:TJT, author = "Allen I. Holub", title = "Taming {Java} Threads", publisher = pub-APRESS, address = pub-APRESS:adr, pages = "x + 300", year = "2000", ISBN = "1-893115-10-0", ISBN-13 = "978-1-893115-10-1", LCCN = "QA76.73.J38 H635 2000", bibdate = "Fri May 10 12:18:17 MDT 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www1.fatbrain.com/asp/bookinfo/bookinfo.asp?theisbn=1893115100&from=NCN454", price = "US\$34.95", acknowledgement = ack-nhfb, keywords = "Java (computer program language); threads (computer programs)", } @Article{Horwood:2000:DMA, author = "Peter Horwood and Shlomo Wygodny and Martin Zardecki", title = "Debugging Multithreaded Applications", journal = j-DDJ, volume = "25", number = "3", pages = "32, 34--37", month = mar, year = "2000", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Thu Nov 9 08:25:14 MST 2000", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.ddj.com/ftp/2000/2000_03/dbgmulti.txt", abstract = "It is often significantly harder to locate and test for bugs in multithreaded and multiprocess applications than for nonthreaded, single process situations. Our authors describe some of the problems with multithreaded applications and discuss common debugging techniques. 
Additional resources include dbgmulti.txt (listings).", acknowledgement = ack-nhfb, fjournal = "Dr. Dobb's Journal of Software Tools", } @Article{Howard:2000:UPW, author = "David M. Howard", title = "Using Predicate Waits with {Win32} Threads", journal = j-CCCUJ, volume = "18", number = "5", pages = "18--??", month = may, year = "2000", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:26 MDT 2002", bibsource = "http://www.cuj.com/articles/2000/0005/0005toc.htm?topic=articles; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Most Win32 synchronization primitives are just that --- primitive. But you can use them to build queues that are safe and easy to use.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Book{Hyde:2000:JTP, author = "Paul Hyde", title = "{Java} thread programming", publisher = pub-SAMS, address = pub-SAMS:adr, pages = "iv + 510", year = "2000", ISBN = "0-672-31585-8", ISBN-13 = "978-0-672-31585-5", LCCN = "QA76.73.J38 H93 1999", bibdate = "Wed Feb 21 06:02:14 2001", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Keller:2000:JUS, author = "J. Keller and T. 
Ungerer", title = "{J.UCS} Special Issue on Multithreaded Processors and Chip-Multiprocessors", journal = j-J-UCS, volume = "6", number = "10", pages = "906--907", day = "28", month = oct, year = "2000", CODEN = "????", ISSN = "0948-695X (print), 0948-6968 (electronic)", ISSN-L = "0948-6968", bibdate = "Wed Feb 20 07:23:07 MST 2002", bibsource = "http://www.jucs.org/jucs; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.jucs.org/jucs_6_10/j_ucs_special_issue", acknowledgement = ack-nhfb, fjournal = "J.UCS: Journal of Universal Computer Science", journal-URL = "http://www.jucs.org/jucs", } @Article{Kleber:2000:TSA, author = "Jeff Kleber", title = "Thread-Safe Access to Collections", journal = j-CCCUJ, volume = "18", number = "5", pages = "36--??", month = may, year = "2000", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:26 MDT 2002", bibsource = "http://www.cuj.com/articles/2000/0005/0005toc.htm?topic=articles; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The best place to store a thread lock for a shared container is somewhere inside the container --- deep inside.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{Lafreniere:2000:SMD, author = "David Lafreniere", title = "State Machine Design in {C++}", journal = j-CCCUJ, volume = "18", number = "5", pages = "58--??", month = may, year = "2000", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:26 MDT 2002", bibsource = "http://www.cuj.com/articles/2000/0005/0005toc.htm?topic=articles; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "It's not all that hard to implement a finite-state machine, unless it's very large, and you have to worry about multithreading, and \ldots{}.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Book{Lewis:2000:MPJ, author = "Bil Lewis and Daniel J. 
Berg", title = "Multithreaded Programming with {Java} Technology", publisher = pub-SUN-MICROSYSTEMS-PRESS, address = pub-SUN-MICROSYSTEMS-PRESS:adr, pages = "xxv + 461", year = "2000", ISBN = "0-13-017007-0", ISBN-13 = "978-0-13-017007-1", LCCN = "QA76.73.J38 L488 2000", bibdate = "Fri Apr 11 15:58:52 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", price = "US\$39.99", series = "Sun BluePrints Program", URL = "http://www.sun.com/books/catalog/lewis3/index.html", acknowledgement = ack-nhfb, } @Article{Ling:2000:AOT, author = "Yibei Ling and Tracy Mullen and Xiaola Lin", title = "Analysis of optimal thread pool size", journal = j-OPER-SYS-REV, volume = "34", number = "2", pages = "42--55", month = apr, year = "2000", CODEN = "OSRED8", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:42 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @Article{Lowy:2000:MPO, author = "Juval Lowy", title = "Making Primitive Objects Thread Safe", journal = j-CCCUJ, volume = "18", number = "3", pages = "85--??", month = mar, year = "2000", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:25 MDT 2002", bibsource = "http://www.cuj.com/articles/2000/0003/0003toc.htm?topic=articles; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "All sorts of things need thread locks. 
A fairly simple template or two can do the job.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @InProceedings{Matsushita:2000:MSC, author = "Satoshi Matsushita and Sunao Torii and Masahiko Nomura and Toshiaki Inoue and Atsufumi Shibayama and Sachiko Shimada and Taku Osawa and Hiroaki Inoue and Kouichiro Minami and Junji Sakai and Yoshiyuki Ito and Yuichi Nakamura and Masato Edahiro and Naoki Nishi and Masakazu Yamashina", title = "{Merlot}: a Single-Chip Tightly Coupled Four-Way Multi-Thread Processor", crossref = "Anonymous:2000:CCI", pages = "??--??", year = "2000", bibdate = "Mon Jan 08 05:28:04 2001", bibsource = "http://www.coolchips.org/index-cool3.html; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "We developed an on-chip four-way multiprocessor, MP98 version 1, code-named Merlot. It is fabricated with a 0.15 $ \mu $ m process and has a die size of 110 mm2. Merlot is a high performance embedded processor for intelligent appliances. We extract a higher degree of parallelism with low voltage operation. In our presentation, we describe our multi-threading model. Then, we explain Merlot's pipeline architecture, focusing on fast thread creation and memory renaming. We also describe our on-chip SDRAM interface which has a throughput greater than 1 GB/sec and cache miss penalty less than 100 ns. Finally, we show a performance estimation for speech recognition and MPEG2 code, power dissipation, and average memory latency. Restructured speech recognition code was compiled with directives, and IPC of 2.72 is estimated.", acknowledgement = ack-nhfb, } @Article{Metzner:2000:MMR, author = "A. Metzner and J. 
Niehaus", title = "{MSparc}: Multithreading in Real-Time Architectures", journal = j-J-UCS, volume = "6", number = "10", pages = "1034--1051", day = "28", month = oct, year = "2000", CODEN = "????", ISSN = "0948-695X (print), 0948-6968 (electronic)", ISSN-L = "0948-6968", bibdate = "Wed Feb 20 07:23:07 MST 2002", bibsource = "http://www.jucs.org/jucs; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.jucs.org/jucs_6_10/msparc_multithreading_in_real", acknowledgement = ack-nhfb, fjournal = "J.UCS: Journal of Universal Computer Science", journal-URL = "http://www.jucs.org/jucs", } @Article{Mohamed:2000:DDM, author = "A. S. Mohamed and A. Galal and I. Khalil and K. Sobh and M. Selim", title = "{Dispo}: Distributed Multi-Threaded Execution of {Prolog} Programs", journal = j-INT-J-COMPUT-APPL, volume = "22", number = "2", pages = "100--108", year = "2000", DOI = "https://doi.org/10.1080/1206212X.2000.11441606", ISSN = "1206-212X (print), 1925-7074 (electronic)", ISSN-L = "1206-212X", bibdate = "Sat Apr 21 17:19:15 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/ijca.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "https://www.tandfonline.com/doi/full/10.1080/1206212X.2000.11441606", acknowledgement = ack-nhfb, fjournal = "International Journal of Computers and Applications", journal-URL = "https://www.tandfonline.com/loi/tjca20", online-date = "10 Jul 2015", } @Article{Mount:2000:ADP, author = "John Mount", title = "Automatic Detection Of Potential Deadlock", journal = j-DDJ, volume = "25", number = "12", pages = "64, 66--70, 72", month = dec, year = "2000", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Wed Nov 8 15:09:25 MST 2000", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.ddj.com/ftp/2000/2000_12/deadlock.txt; http://www.ddj.com/ftp/2000/2000_12/deadlock.zip", abstract = "Deadlock can occur when a number of consumers (typically threads) access a set of 
resources in an unacceptable pattern. To combat it, John presents a solution based on run-time lock analysis that analyzes all transactions. Additional resources include deadlock.txt (listings) and deadlock.zip (source code).", acknowledgement = ack-nhfb, fjournal = "Dr. Dobb's Journal of Software Tools", } @Article{Nemeth:2000:AMD, author = "Zsolt N{\'e}meth", title = "Abstract machine design on a multithreaded architecture", journal = j-FUT-GEN-COMP-SYS, volume = "16", number = "6", pages = "705--716", month = apr, year = "2000", CODEN = "FGSEVI", ISSN = "0167-739X (print), 1872-7115 (electronic)", ISSN-L = "0167-739X", bibdate = "Wed Feb 27 12:41:20 MST 2002", bibsource = "http://www.elsevier.com/locate/issn/0167739X; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.elsevier.com/gej-ng/10/19/19/41/29/36/abstract.html", acknowledgement = ack-nhfb, fjournal = "Future Generation Computer Systems", journal-URL = "http://www.sciencedirect.com/science/journal/0167739X", } @Article{Nielsen:2000:MTN, author = "Ida M. B. Nielsen and Curtis L. 
Janssen", title = "Multi-threading: a new dimension to massively parallel scientific computation", journal = j-COMP-PHYS-COMM, volume = "128", number = "1--2", pages = "238--244", day = "9", month = jun, year = "2000", CODEN = "CPHCBZ", DOI = "https://doi.org/10.1016/S0010-4655(00)00062-X", ISSN = "0010-4655 (print), 1879-2944 (electronic)", ISSN-L = "0010-4655", bibdate = "Mon Feb 13 23:40:43 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/compphyscomm2000.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.sciencedirect.com/science/article/pii/S001046550000062X", acknowledgement = ack-nhfb, fjournal = "Computer Physics Communications", journal-URL = "http://www.sciencedirect.com/science/journal/00104655", } @Article{Oyama:2000:OCC, author = "Yoshihiro Oyama and Kenjiro Taura and Akinori Yonezawa", title = "Online Computation of Critical Paths for Multithreaded Languages", journal = j-LECT-NOTES-COMP-SCI, volume = "1800", pages = "301--??", year = "2000", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Fri Feb 1 09:16:18 MST 2002", bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t1800.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer-ny.com/link/service/series/0558/bibs/1800/18000301.htm; http://link.springer-ny.com/link/service/series/0558/papers/1800/18000301.pdf", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Peterson:2000:CCT, author = "Mark Peterson", title = "{C/C++} Tips: Tip \#4: Self Destructing Threads", journal = j-CCCUJ, volume = "18", number = "12", pages = "44--??", month = dec, year = "2000", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:29 MDT 2002", bibsource = "http://www.cuj.com/articles/2000/0012/0012toc.htm?topic=articles; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "A way to make threads easier 
to manage.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{Pulleyn:2000:EPM, author = "Ivan Pulleyn", title = "Embedding {Python} in Multi-Threaded {C\slash C++} Applications", journal = j-LINUX-J, volume = "73", pages = "??--??", month = may, year = "2000", CODEN = "LIJOFX", ISSN = "1075-3583 (print), 1938-3827 (electronic)", ISSN-L = "1075-3583", bibdate = "Thu Sep 21 07:44:12 MDT 2000", bibsource = "http://noframes.linuxjournal.com/lj-issues/issue73/index.html; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Python provides a clean intuitive interface to complex, threaded applications.", acknowledgement = ack-nhfb, fjournal = "Linux journal", journal-URL = "http://portal.acm.org/citation.cfm?id=J508", } @InProceedings{Ranganathan:2000:AMT, author = "M. Ranganathan and Mark Bednarek and Fernand Pors and Doug Montgomery", title = "{AGNI}: a Multi-threaded Middleware for Distributed Scripting", crossref = "USENIX:2000:PUT", pages = "??--??", year = "2000", bibdate = "Wed Oct 16 05:17:16 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/usenix2000.bib", URL = "http://db.usenix.org/publications/library/proceedings/tcl2k/ranganathan.html", acknowledgement = ack-nhfb, } @Article{Redstone:2000:AOSa, author = "Joshua A. Redstone and Susan J. Eggers and Henry M. Levy", title = "An analysis of operating system behavior on a simultaneous multithreaded architecture", journal = j-COMP-ARCH-NEWS, volume = "28", number = "5", pages = "245--256", month = dec, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Redstone:2000:AOSb, author = "Joshua A. 
Redstone and Susan J. Eggers and Henry M. Levy", title = "An Analysis of Operating System Behavior on a Simultaneous Multithreaded Architecture", journal = j-SIGPLAN, volume = "35", number = "11", pages = "245--256", month = nov, year = "2000", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:18:19 MST 2003", bibsource = "http://foothill.lcs.mit.edu/asplos2k/program.html; http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Redstone:2000:AOSc, author = "Joshua A. Redstone and Susan J. Eggers and Henry M. Levy", title = "An analysis of operating system behavior on a simultaneous multithreaded architecture", journal = j-OPER-SYS-REV, volume = "34", number = "5", pages = "245--256", month = dec, year = "2000", CODEN = "OSRED8", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:56 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGOPS Operating Systems Review", } @Article{Reinhardt:2000:TFD, author = "Steven K. Reinhardt and Shubhendu S. Mukherjee", title = "Transient fault detection via simultaneous multithreading", journal = j-COMP-ARCH-NEWS, volume = "28", number = "2", pages = "25--36", month = may, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Samorodin:2000:SFS, author = "Steven H. 
Samorodin and Raju Pandey", title = "Supporting Flexible Safety and Sharing in Multi-threaded Environments", journal = j-LECT-NOTES-COMP-SCI, volume = "1800", pages = "1184--??", year = "2000", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Fri Feb 1 09:16:18 MST 2002", bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t1800.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer-ny.com/link/service/series/0558/bibs/1800/18001184.htm; http://link.springer-ny.com/link/service/series/0558/papers/1800/18001184.pdf", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Shinjo:2000:DCEa, author = "Yasushi Shinjo and Calton Pu", title = "Developing correct and efficient multithreaded programs with thread-specific data and a partial evaluator", journal = j-OPER-SYS-REV, volume = "34", number = "2", pages = "33--33", month = apr, year = "2000", CODEN = "OSRED8", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:42 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @Article{Shinjo:2000:DCEb, author = "Yasushi Shinjo", title = "Developing correct and efficient multithreaded programs with thread-specific data and a partial evaluator", journal = j-OPER-SYS-REV, volume = "34", number = "2", pages = "40--40", month = apr, year = "2000", CODEN = "OSRED8", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:42 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @Article{Snavely:2000:SJSa, author = "Allan Snavely and Dean M. 
Tullsen", title = "Symbiotic job scheduling for a simultaneous multithreaded processor", journal = j-COMP-ARCH-NEWS, volume = "28", number = "5", pages = "234--244", month = dec, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Snavely:2000:SJSb, author = "Allan Snavely and Dean M. Tullsen", title = "Symbiotic Jobscheduling for a Simultaneous Multithreading Processor", journal = j-SIGPLAN, volume = "35", number = "11", pages = "234--244", month = nov, year = "2000", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:18:19 MST 2003", bibsource = "http://foothill.lcs.mit.edu/asplos2k/program.html; http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Snavely:2000:SJSc, author = "Allan Snavely and Dean M. Tullsen", title = "Symbiotic jobscheduling for a simultaneous multithreaded processor", journal = j-OPER-SYS-REV, volume = "34", number = "5", pages = "234--244", month = dec, year = "2000", CODEN = "OSRED8", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:56 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGOPS Operating Systems Review", } @Article{Steffan:2000:SAT, author = "J. Greggory Steffan and Christopher B. Colohan and Antonia Zhai and Todd C. 
Mowry", title = "A scalable approach to thread-level speculation", journal = j-COMP-ARCH-NEWS, volume = "28", number = "2", pages = "1--12", month = may, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Tan:2000:PEN, author = "Kian-Lee Tan and Cheng Hian Goh and Beng Chin Ooi", title = "Progressive evaluation of nested aggregate queries", journal = j-VLDB-J, volume = "9", number = "3", pages = "261--278", month = dec, year = "2000", CODEN = "VLDBFR", ISSN = "1066-8888 (print), 0949-877X (electronic)", ISSN-L = "1066-8888", bibdate = "Mon Jun 23 10:50:54 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "In many decision-making scenarios, decision makers require rapid feedback to their queries, which typically involve aggregates. The traditional {\em blocking execution model\/} can no longer meet the demands of these users. One promising approach in the literature, called {\em online aggregation}, evaluates an aggregation query progressively as follows: as soon as certain data have been evaluated, approximate answers are produced with their respective running confidence intervals; as more data are examined, the answers and their corresponding running confidence intervals are refined. In this paper, we extend this approach to handle nested queries with aggregates (i.e., at least one inner query block is an aggregate query) by providing users with (approximate) answers progressively as the inner aggregation query blocks are evaluated. We address the new issues posed by nested queries. 
In particular, the answer space begins with a superset of the final answers and is refined as the aggregates from the inner query blocks are refined. For the intermediary answers to be meaningful, they have to be interpreted with the aggregates from the inner queries. We also propose a {\em multi-threaded model\/} in evaluating such queries: each query block is assigned to a thread, and the threads can be evaluated concurrently and independently. The time slice across the threads is {\em nondeterministic\/} in the sense that the user controls the relative rate at which these subqueries are being evaluated. For {\em enumerative\/} nested queries, we propose a priority-based evaluation strategy to present answers that are certainly in the final answer space first, before presenting those whose validity may be affected as the inner query aggregates are refined. We implemented a prototype system using Java and evaluated our system. Results for nested queries with a level and multiple levels of nesting are reported. 
Our results show the effectiveness of the proposed mechanisms in providing progressive feedback that reduces the initial waiting time of users significantly without sacrificing the quality of the answers.", acknowledgement = ack-nhfb, fjournal = "VLDB Journal: Very Large Data Bases", journal-URL = "http://portal.acm.org/toc.cfm?id=J869", keywords = "approximate answers; multi-threading; nested aggregate queries; online aggregation; progressive query processing", } @Article{Tang:2000:PTR, author = "Hong Tang and Kai Shen and Tao Yang", title = "Program transformation and runtime support for threaded {MPI} execution on shared-memory machines", journal = j-TOPLAS, volume = "22", number = "4", pages = "673--700", year = "2000", CODEN = "ATPSDT", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Tue Apr 17 10:05:24 MDT 2001", bibsource = "http://www.acm.org/pubs/toc/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org/pubs/citations/journals/toplas/2000-22-4/p673-tang/", abstract = "Parallel programs written in MPI have been widely used for developing high-performance applications on various platforms. Because of a restriction of the MPI computation model, conventional MPI implementations on shared-memory machines map each MPI node to an OS process, which can suffer serious performance degradation in the presence of multiprogramming. This paper studies compile-time and runtime techniques for enhancing performance portability of MPI code running on multiprogrammed shared-memory machines. The proposed techniques allow MPI nodes to be executed safety and efficiently as threads. Compile-time transformation eliminates global and static variables in C code using node-specific data. The runtime support includes an efficient and provably correct communication protocol that uses lock-free data structure and takes advantage of address space sharing among threads. 
The experiments on SGI Origin 2000 show that our MPI prototype called TMPI using the proposed techniques is competitive with SGI's native MPI implementation in a dedicated environment, and that it has significant performance advantages in a multiprogrammed environment.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Programming Languages and Systems", generalterms = "Algorithms; Design; Experimentation; Languages; Performance", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783", keywords = "lock-free synchronization; MPI; multiprogrammed environments; program transformation; shared-memory machines; threaded execution", subject = "Hardware --- Memory Structures --- Design Styles (B.3.2): {\bf Shared memory}; Software --- Programming Techniques --- Concurrent Programming (D.1.3): {\bf Parallel programming}; Software --- Programming Languages --- Language Classifications (D.3.2): {\bf Concurrent, distributed, and parallel languages}; Software --- Programming Languages --- Processors (D.3.4): {\bf Preprocessors}; Software --- Programming Languages --- Processors (D.3.4): {\bf Run-time environments}; Software --- Operating Systems --- Process Management (D.4.1): {\bf Multiprocessing/multiprogramming/multitasking}; Data --- Data Structures (E.1): {\bf Lists, stacks, and queues}", } @InProceedings{Theobald:2000:LCE, author = "Kevin B. Theobald and Gagan Agrawal and Rishi Kumar and Gerd Heber and Guang R. Gao and Paul Stodghill and Keshav Pingali", title = "Landing {CG} on {EARTH}: a Case Study of Fine-Grained Multithreading on an Evolutionary Path", crossref = "ACM:2000:SHP", pages = "47--47", year = "2000", bibdate = "Mon Feb 12 11:57:42 2001", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.sc2000.org/proceedings/techpapr/papers/pap293.pdf", acknowledgement = ack-nhfb, } @Article{Unger:2000:CCA, author = "A. Unger and E. Zehendner and Th. 
Ungerer", title = "A combined compiler and architecture technique to control multithreaded execution of branches and loop iterations", journal = j-COMP-ARCH-NEWS, volume = "28", number = "1", pages = "53--61", month = mar, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:36 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @InProceedings{Vckovski:2000:MTS, author = "Andrej Vckovski and Jason Brazile", title = "A Multi-Threaded Server for Shared Hash Table Access", crossref = "USENIX:2000:PUT", pages = "??--??", year = "2000", bibdate = "Wed Oct 16 05:17:16 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/usenix2000.bib", URL = "http://db.usenix.org/publications/library/proceedings/tcl2k/vckovski.html", acknowledgement = ack-nhfb, } @Article{Vishkin:2000:ELR, author = "Dascal Vishkin and Uzi Vishkin", title = "Experiments with list ranking for explicit multi-threaded {(XMT)} instruction parallelism", journal = j-ACM-J-EXP-ALGORITHMICS, volume = "5", pages = "10:1--10:??", month = "????", year = "2000", CODEN = "????", DOI = "https://doi.org/10.1145/351827.384252", ISSN = "1084-6654", bibdate = "Mon Oct 6 16:03:09 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Algorithms for the problem of list ranking are empirically studied with respect to the Explicit Multi-Threaded (XMT) platform for instruction-level parallelism (ILP). The main goal of this study is to understand the differences between XMT and more traditional parallel computing implementation platforms/models as they pertain to the well studied list ranking problem. 
The main two findings are: (i) good speedups for much smaller inputs are possible and (ii) in part, the first finding is based on a new variant of a 1984 algorithm, called the No-Cut algorithm. The paper incorporates analytic (non-asymptotic) performance analysis into experimental performance analysis for relatively small inputs. This provides an interesting example where experimental research and theoretical analysis complement one another. Explicit Multi-Threading (XMT) is a fine-grained computation framework introduced in our SPAA'98 paper. Building on some key ideas of parallel computing, XMT covers the spectrum from algorithms through architecture to implementation; the main implementation related innovation in XMT was through the incorporation of low-overhead hardware and software mechanisms (for more effective fine-grained parallelism). The reader is referred to that paper for detail on these mechanisms. The XMT platform aims at faster single-task completion time by way of ILP.", acknowledgement = ack-nhfb, articleno = "10", fjournal = "Journal of Experimental Algorithmics (JEA)", } @Book{Walmsley:2000:MTP, author = "Mark Walmsley", title = "Multi-threaded programming in {C++}", publisher = pub-SV, address = pub-SV:adr, pages = "x + 223", year = "2000", ISBN = "1-85233-146-1", ISBN-13 = "978-1-85233-146-7", LCCN = "QA76.73.C153 W3148 2000", bibdate = "Sat Apr 20 11:14:00 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", price = "US\$49.95", acknowledgement = ack-nhfb, } @Article{Wilson:2000:PBC, author = "Gregory V. 
Wilson", title = "Programmer's Bookshelf: Classics Old and New", journal = j-DDJ, volume = "25", number = "11", pages = "159--160", month = nov, year = "2000", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Wed Nov 8 15:09:25 MST 2000", bibsource = "http://www.ddj.com/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "This month Greg looks at Programming Pearls, Second Edition, by Jon Bentley; Foundations of Multithreaded, Parallel, and Distributed Programming, by Gregory R. Andrews; GUI Bloopers, by Jeff Johnson; The Humane Interface, by Jef Raskin; Legal Battles That Shaped the Software Industry, by Lawrence D. Graham; The World of Scripting Languages, by David Barron; C for Java Programmers, by Tomasz Muldner; and XML Elements of Style, by Simon St. Laurent.", acknowledgement = ack-nhfb, fjournal = "Dr. Dobb's Journal of Software Tools", } @Article{Zhang:2000:WMH, author = "Peter Zhang", title = "{Webrelay}: a Multithreaded {HTTP} Relay Server", journal = j-DDJ, volume = "25", number = "2", pages = "86, 88, 90--94, 96", month = feb, year = "2000", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Thu Nov 9 08:25:13 MST 2000", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.ddj.com/ftp/2000/2000_02/webrelay.txt; http://www.ddj.com/ftp/2000/2000_02/webrelay.zip", abstract = "Webrelay is a freely available multithreaded HTTP relay server that authenticates that clients are legitimate users before they are connected to vendor web servers. Additional resources include webrelay.txt (listings) and webrelay.zip (source code).", acknowledgement = ack-nhfb, fjournal = "Dr. 
Dobb's Journal of Software Tools", } @Article{Anonymous:2001:ESM, author = "Anonymous", title = "Errata: {``Speculative Multithreaded Processors''}", journal = j-COMPUTER, volume = "34", number = "5", pages = "7--7", month = may, year = "2001", CODEN = "CPTRB4", ISSN = "0018-9162 (print), 1558-0814 (electronic)", ISSN-L = "0018-9162", bibdate = "Fri May 4 17:53:39 MDT 2001", bibsource = "https://www.math.utah.edu/pub/tex/bib/computer2000.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "See \cite{Sohi:2001:SMP}.", URL = "http://dlib.computer.org/co/books/co2001/pdf/r5004.pdf", acknowledgement = ack-nhfb, fjournal = "Computer", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2", } @Article{Antoniu:2001:CMJ, author = "Gabriel Antoniu and Luc Boug{\'e} and Philip Hatcher and Mark MacBeth and Keith McGuigan and Raymond Namyst", title = "Compiling Multithreaded {Java} Bytecode for Distributed Execution (Distinguished Paper)", journal = j-LECT-NOTES-COMP-SCI, volume = "1900", pages = "1039--??", year = "2001", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Sat Feb 2 13:02:44 MST 2002", bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t1900.htm; https://www.math.utah.edu/pub/tex/bib/java2000.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer-ny.com/link/service/series/0558/bibs/1900/19001039.htm; http://link.springer-ny.com/link/service/series/0558/papers/1900/19001039.pdf", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Antoniu:2001:DPP, author = "Gabriel Antoniu and Luc Boug{\'e}", title = "{DSM-PM2}: a Portable Implementation Platform for Multithreaded {DSM} Consistency Protocols", journal = j-LECT-NOTES-COMP-SCI, volume = "2026", pages = "55--??", year = "2001", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Sat 
Feb 2 13:03:43 MST 2002", bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2026.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer-ny.com/link/service/series/0558/bibs/2026/20260055.htm; http://link.springer-ny.com/link/service/series/0558/papers/2026/20260055.pdf", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Antoniu:2001:HSC, author = "Gabriel Antoniu and others", title = "The {Hyperion} system: {Compiling} multithreaded {Java} bytecode for distributed execution", journal = j-PARALLEL-COMPUTING, volume = "27", number = "10", pages = "1279--1297", month = sep, year = "2001", CODEN = "PACOEJ", ISSN = "0167-8191 (print), 1872-7336 (electronic)", ISSN-L = "0167-8191", bibdate = "Fri Feb 22 16:52:42 MST 2002", bibsource = "http://www.elsevier.com/locate/issn/01678191; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.elsevier.com/gej-ng/10/35/21/47/40/27/abstract.html; http://www.elsevier.nl/gej-ng/10/35/21/47/40/27/article.pdf", acknowledgement = ack-nhfb, fjournal = "Parallel Computing", journal-URL = "http://www.sciencedirect.com/science/journal/01678191", } @Article{Attali:2001:GVJ, author = "Isabelle Attali and Denis Caromel and Marjorie Russo", title = "Graphical Visualization of {Java} Objects, Threads, and Locks", journal = j-IEEE-DISTRIB-SYST-ONLINE, volume = "2", number = "1", year = "2001", ISSN = "1541-4922 (print), 1558-1683 (electronic)", ISSN-L = "1541-4922", bibdate = "Wed Oct 23 17:47:56 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://dsonline.computer.org/0101/features/att0101_print.htm", acknowledgement = ack-nhfb, fjournal = "IEEE Distributed Systems Online", } @Article{Ball:2001:PVM, author = "Thomas Ball and Sagar Chaki and Sriram K. 
Rajamani", title = "Parameterized Verification of Multithreaded Software Libraries", journal = j-LECT-NOTES-COMP-SCI, volume = "2031", pages = "158--??", year = "2001", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Sat Feb 2 13:03:48 MST 2002", bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2031.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer-ny.com/link/service/series/0558/bibs/2031/20310158.htm; http://link.springer-ny.com/link/service/series/0558/papers/2031/20310158.pdf", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Becker:2001:SMW, author = "Thomas Becker", title = "Synchronization Monitors For {Win32}", journal = j-DDJ, volume = "26", number = "12", pages = "46, 48, 50--52, 54", month = dec, year = "2001", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Tue Feb 12 05:21:41 MST 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.ddj.com/ftp/2001/2001_12/monitor.txt; http://www.ddj.com/ftp/2001/2001_12/monitor.zip", abstract = "Thomas presents a Java-style synchronization monitor for multithreaded Win32 development. Additional resources include {\tt monitor.txt} (listings) and {\tt monitor.zip} (source code).", acknowledgement = ack-nhfb, fjournal = "Dr. 
Dobb's Journal of Software Tools", } @Article{Broberg:2001:POU, author = "Magnus Broberg and Lars Lundberg and H{\aa}kan Grahn", title = "Performance Optimization Using Extended Critical Path Analysis in Multithreaded Programs on Multiprocessors", journal = j-J-PAR-DIST-COMP, volume = "61", number = "1", pages = "115--136", day = "1", month = jan, year = "2001", CODEN = "JPDCER", DOI = "https://doi.org/10.1006/jpdc.2000.1667", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Fri Feb 22 15:30:35 MST 2002", bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc; https://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.2000.1667; http://www.idealibrary.com/links/doi/10.1006/jpdc.2000.1667/pdf; http://www.idealibrary.com/links/doi/10.1006/jpdc.2000.1667/ref", acknowledgement = ack-nhfb, fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", } @Article{Brunst:2001:GBP, author = "Holger Brunst and Wolfgang E. Nagel and Hans-Christian Hoppe", title = "Group-Based Performance Analysis for Multithreaded {SMP} Cluster Applications", journal = j-LECT-NOTES-COMP-SCI, volume = "2150", pages = "148--??", year = "2001", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Sat Feb 2 13:05:53 MST 2002", bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2150.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer-ny.com/link/service/series/0558/bibs/2150/21500148.htm; http://link.springer-ny.com/link/service/series/0558/papers/2150/21500148.pdf", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Bull:2001:MSO, author = "J. 
Mark Bull and Darragh O'Neill", title = "A microbenchmark suite for {OpenMP 2.0}", journal = j-COMP-ARCH-NEWS, volume = "29", number = "5", pages = "41--48", month = dec, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Book{Chandra:2001:PPO, author = "Rohit Chandra and Leonardo Dagum and David Kohr and Dror Maydan and Jeff McDonald and Ramesh Menon", title = "Parallel Programming in {OpenMP}", publisher = pub-MORGAN-KAUFMANN, address = pub-MORGAN-KAUFMANN:adr, pages = "xvi + 230", year = "2001", ISBN = "1-55860-671-8", ISBN-13 = "978-1-55860-671-5", LCCN = "QA76.642 .P38 2001", bibdate = "Thu Jul 14 11:09:17 2005", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/unix.bib", price = "US\$39.95", URL = "http://www.mkp.com/books_catalog/catalog.asp?ISBN=1-55860-671-8", abstract = "The rapid and widespread acceptance of shared memory multiprocessor architectures has created a pressing demand for an efficient way to program these systems. At the same time, developers of technical and scientific applications in industry and in government laboratories find they need to parallelize huge volumes of code in a portable fashion. OpenMP, developed jointly by several parallel computing vendors to address these issues, is an industry-wide standard for programming shared-memory and distributed shared-memory multiprocessors. It consists of a set of compiler directives and library routines that extend FORTRAN, C, and C++ codes to express shared-memory parallelism. 
Parallel Programming in OpenMP is the first book to teach both the novice and expert parallel programmers how to program using this new standard. The authors, who helped design and implement OpenMP while at SGI, bring a depth and breadth to the book as compiler writers, application developers, and performance engineers.", acknowledgement = ack-nhfb, keywords = "parallel programming (computer science)", tableofcontents = "Foreword \\ Preface \\ 1: Introduction \\ Performance with OpenMP \\ A first glimpse of OpenMP \\ The OpenMP parallel computer \\ Why OpenMP \\ History of OpenMP \\ Navigating the rest of the book \\ 2: Getting started with OpenMP \\ 3: Exploiting loop-level parallelism \\ Meaning of the parallel do directive \\ Controlling data sharing \\ Removing data dependences \\ Enhancing performance \\ 4: Beyond loop-level parallelism, parallel regions \\ 5: Synchronization \\ 6: Performance", } @Article{ChassindeKergommeaux:2001:PEE, author = "Jacques {Chassin de Kergommeaux} and Benhur de Oliveira Stein", title = "Paj{\'e}: An Extensible Environment for Visualizing Multi-threaded Programs Executions", journal = j-LECT-NOTES-COMP-SCI, volume = "1900", pages = "133--??", year = "2001", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Sat Feb 2 13:02:44 MST 2002", bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t1900.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer-ny.com/link/service/series/0558/bibs/1900/19000133.htm; http://link.springer-ny.com/link/service/series/0558/papers/1900/19000133.pdf", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @InProceedings{Christiaens:2001:JRR, author = "Mark Christiaens", title = "{JaRec}: Record\slash Replay for Multi-threaded {Java} Programs", crossref = "USENIX:2001:PJV", pages = "??--??", year = "2001", bibdate = "Tue Oct 15 17:45:19 2002", bibsource = 
"https://www.math.utah.edu/pub/tex/bib/java2000.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/usenix2000.bib", URL = "http://www.usenix.org/publications/library/proceedings/jvm01/JVM_wips/S07.pdf", acknowledgement = ack-nhfb, } @Article{Duncan:2001:LPD, author = "Ray Duncan and Duncan Harris and Douglas Reilly and Craig Rodrigues and Michael Birken and Paul S. Person", title = "Letters: Plug-in Desupport; Threading and the {.Net} Framework; {CORBA} Interoperability; Game Over for {Java}; Totally Wired", journal = j-DDJ, volume = "26", number = "11", pages = "10, 12", month = nov, year = "2001", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Tue Feb 12 05:21:40 MST 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.ddj.com/", acknowledgement = ack-nhfb, fjournal = "Dr. Dobb's Journal of Software Tools", } @InProceedings{Edelstein:2001:MJP, author = "Orit Edelstein and Eitan Farchi and Yarden Nir and Gil Ratsaby and Shmuel Ur", title = "Multithreaded {Java} Program Test Generation", crossref = "ACM:2001:PAJ", pages = "181--181", year = "2001", bibdate = "Mon May 06 09:31:01 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.philippsen.com/JGI2001/camerareadyabstracts/18.html; http://www.philippsen.com/JGI2001/finalpapers/18500181.ps", acknowledgement = ack-nhfb, keywords = "Java", } @Article{Elwasif:2001:AMT, author = "Wael R. Elwasif and David E. Bernholdt and James A. Kohl and G. A. 
Geist", title = "An Architecture for a Multi-threaded Harness Kernel", journal = j-LECT-NOTES-COMP-SCI, volume = "2131", pages = "126--??", year = "2001", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Fri Feb 1 08:13:55 MST 2002", bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2131.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pvm.bib", URL = "http://link.springer-ny.com/link/service/series/0558/bibs/2131/21310126.htm; http://link.springer-ny.com/link/service/series/0558/papers/2131/21310126.pdf", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Evripidou:2001:MDD, author = "Paraskevas Evripidou", title = "{$ D^3 $-Machine}: a decoupled data-driven multithreaded architecture with variable resolution support", journal = j-PARALLEL-COMPUTING, volume = "27", number = "9", pages = "1197--1225", month = aug, year = "2001", CODEN = "PACOEJ", ISSN = "0167-8191 (print), 1872-7336 (electronic)", ISSN-L = "0167-8191", bibdate = "Wed Jul 18 06:31:16 MDT 2001", bibsource = "http://www.elsevier.com/locate/issn/01678191; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.elsevier.nl/gej-ng/10/35/21/47/35/25/abstract.html; http://www.elsevier.nl/gej-ng/10/35/21/47/35/25/article.pdf", acknowledgement = ack-nhfb, fjournal = "Parallel Computing", journal-URL = "http://www.sciencedirect.com/science/journal/01678191", } @Article{Figueiredo:2001:IPH, author = "Renato J. O. Figueiredo and Jeffrey P. Bradford and Jos{\'e} A. B. 
Fortes", title = "Improving the Performance of Heterogeneous {DSMs} via Multithreading", journal = j-LECT-NOTES-COMP-SCI, volume = "1981", pages = "168--??", year = "2001", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Sat Feb 2 13:03:02 MST 2002", bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t1981.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer-ny.com/link/service/series/0558/bibs/1981/19810168.htm; http://link.springer-ny.com/link/service/series/0558/papers/1981/19810168.pdf", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Garber:2001:NBT, author = "Lee Garber", title = "News Briefs: Is Tech Downturn Changing Education and Employment Trends; {HTMT} Promises High-Performance Computing; Controversial Software Law [{UCITA}] Hits Resistance", journal = j-COMPUTER, volume = "34", number = "10", pages = "19--21", month = oct, year = "2001", CODEN = "CPTRB4", ISSN = "0018-9162 (print), 1558-0814 (electronic)", ISSN-L = "0018-9162", bibdate = "Fri Feb 8 07:11:46 MST 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://dlib.computer.org/co/books/co2001/pdf/rx019.pdf; http://www.computer.org/computer/co2001/rx019abs.htm", acknowledgement = ack-nhfb, fjournal = "Computer", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2", keywords = "hybrid technology multithreaded architecture (HTMT); Uniform Computer Information Transactions Act (UCITA)", } @Article{Geiselbrecht:2001:NOS, author = "Travis K.
Geiselbrecht", title = "The {NewOS} Operating System", journal = j-DDJ, volume = "26", number = "12", pages = "33, 35, 38, 40, 42, 44", month = dec, year = "2001", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Tue Feb 12 05:21:41 MST 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "See correction \cite{Editors:2002:LUC}.", URL = "http://www.ddj.com/ftp/2001/2001_12/newos.txt; http://www.ddj.com/ftp/2001/2001_12/newos.zip", abstract = "NewOS is a freely available lightweight operating system written in C for platforms ranging from Intel- and AMD-based PCs to the Sega Dreamcast. Additional resources include {\tt newos.txt} (listings) and {\tt newos.zip} (source code).", acknowledgement = ack-nhfb, fjournal = "Dr. Dobb's Journal of Software Tools", } @Article{Goeschl:2001:JTT, author = "Siegfried Goeschl", title = "The {JUnit++} Testing Tool", journal = j-DDJ, volume = "26", number = "2", pages = "34, 36--38", month = feb, year = "2001", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Thu Feb 15 12:14:41 MST 2001", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.ddj.com/ftp/2001/2001_02/junitpp.txt; http://www.ddj.com/ftp/2001/2001_02/junitpp.zip", abstract = "JUnit++ is a freely available Java unit test framework that includes a test data repository, command-line arguments, and a TestRunner class that supports a built-in repetition counter and multithreading at the command line. Additional resources include junitpp.txt (listings) and junitpp.zip (source code).", acknowledgement = ack-nhfb, fjournal = "Dr. Dobb's Journal of Software Tools", } @InProceedings{Hanson:2001:UFI, author = "Richard J. Hanson and Clay P. Breshears and Henry A. 
Gabb", title = "Using a {Fortran} Interface to {POSIX} Threads", crossref = "Boisvert:2001:ASS", pages = "257--272", year = "2001", bibdate = "Sat Dec 29 09:54:37 2007", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Huber:2001:EFC, author = "Andreas Huber", title = "Elegant Function Call Wrappers", journal = j-CCCUJ, volume = "19", number = "5", pages = "8--??", month = may, year = "2001", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:31 MDT 2002", bibsource = "http://www.cuj.com/articles/2001/0105/0105toc.htm?topic=articles; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Scheduling functions for later execution is an obvious requirement in multithreaded programs. How to do that and preserve both type safety and modularity is not so obvious. The author combines an old pattern and some new template techniques to pull it off rather nicely.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{Ishihara:2001:CCP, author = "Takashi Ishihara and Tiejun Li and Eugene F. Fodor and Ronald A. 
Olsson", title = "A Comparison of Concurrent Programming and Cooperative Multithreading", journal = j-LECT-NOTES-COMP-SCI, volume = "1900", pages = "729--??", year = "2001", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Sat Feb 2 13:02:44 MST 2002", bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t1900.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer-ny.com/link/service/series/0558/bibs/1900/19000729.htm; http://link.springer-ny.com/link/service/series/0558/papers/1900/19000729.pdf", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Iwama:2001:ICB, author = "Chitaka Iwama and Niko Demus Barli and Shuichi Sakai and Hidehiko Tanaka", title = "Improving Conditional Branch Prediction on Speculative Multithreading Architectures", journal = j-LECT-NOTES-COMP-SCI, volume = "2150", pages = "413--??", year = "2001", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Sat Feb 2 13:05:53 MST 2002", bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2150.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer-ny.com/link/service/series/0558/bibs/2150/21500413.htm; http://link.springer-ny.com/link/service/series/0558/papers/2150/21500413.pdf", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Iwata:2001:PMT, author = "Kazunori Iwata and Shingo Itabashi and Naohiro Ishii", title = "A Protocol for Multi-Threaded Processes with Choice in $ \pi $-Calculus", journal = j-LECT-NOTES-COMP-SCI, volume = "2074", pages = "138--??", year = "2001", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Sat Feb 2 13:04:30 MST 2002", bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2074.htm; 
https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer-ny.com/link/service/series/0558/bibs/2074/20740138.htm; http://link.springer-ny.com/link/service/series/0558/papers/2074/20740138.pdf", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Kakulavarapu:2001:DLB, author = "P. Kakulavarapu and O. C. Maquelin and J. N. Amaral and G. R. Gao", title = "Dynamic Load Balancers for a Multithreaded Multiprocessor System", journal = j-PARALLEL-PROCESS-LETT, volume = "11", number = "1", pages = "169--??", month = mar, year = "2001", CODEN = "PPLTEE", ISSN = "0129-6264 (print), 1793-642X (electronic)", bibdate = "Sat Feb 23 19:27:51 MST 2002", bibsource = "http://ejournals.wspc.com.sg/ppl/ppl.shtml; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Parallel Processing Letters", journal-URL = "http://www.worldscientific.com/loi/ppl", } @Article{Kienzle:2001:CTT, author = "J{\"o}rg Kienzle and Alexander Romanovsky", title = "Combining tasking and transactions, part {II}: open multithreaded transactions", journal = j-SIGADA-LETTERS, volume = "21", number = "1", pages = "67--74", month = mar, year = "2001", CODEN = "AALEE5", ISSN = "1094-3641 (print), 1557-9476 (electronic)", ISSN-L = "1094-3641", bibdate = "Sat Aug 9 09:06:10 MDT 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGAda Ada Letters", } @Article{Kienzle:2001:IEO, author = "J{\"o}rg Kienzle and Alexander Romanovsky", title = "Implementing exceptions in open multithreaded transactions based on {Ada 95} exceptions", journal = j-SIGADA-LETTERS, volume = "21", number = "3", pages = "57--63", month = sep, year = "2001", CODEN = "AALEE5", ISSN = "1094-3641 (print), 1557-9476 (electronic)", ISSN-L = "1094-3641", bibdate = "Sat Aug 9 09:06:11 MDT 2003", bibsource = "http://www.acm.org/sigada/ada_letters/; 
https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGAda Ada Letters", } @Article{Legrand:2001:MTD, author = "Iosif {Legrand, on behalf of the MONARC Collaboration}", title = "Multi-threaded, discrete event simulation of distributed computing systems", journal = j-COMP-PHYS-COMM, volume = "140", number = "1--2", pages = "274--285", day = "15", month = oct, year = "2001", CODEN = "CPHCBZ", DOI = "https://doi.org/10.1016/S0010-4655(01)00281-8", ISSN = "0010-4655 (print), 1879-2944 (electronic)", ISSN-L = "0010-4655", bibdate = "Mon Feb 13 23:41:04 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/compphyscomm2000.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.sciencedirect.com/science/article/pii/S0010465501002818", acknowledgement = ack-nhfb, fjournal = "Computer Physics Communications", journal-URL = "http://www.sciencedirect.com/science/journal/00104655", } @Article{Lopes:2001:FGM, author = "L. Lopes and V. T. Vasconcelos and F. 
Silva", title = "Fine-grained multithreading with process calculi", journal = j-IEEE-TRANS-COMPUT, volume = "50", number = "8", pages = "852--862", month = aug, year = "2001", CODEN = "ITCOB4", DOI = "https://doi.org/10.1109/12.947014", ISSN = "0018-9340 (print), 1557-9956 (electronic)", ISSN-L = "0018-9340", bibdate = "Tue Jul 5 10:03:11 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2000.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=947014", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Computers", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12", } @Article{Luk:2001:TML, author = "Chi-Keung Luk", title = "Tolerating memory latency through software-controlled pre-execution in simultaneous multithreading processors", journal = j-COMP-ARCH-NEWS, volume = "29", number = "2", pages = "40--51", month = may, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @InProceedings{Manson:2001:CSM, author = "Jeremy Manson and William Pugh", title = "Core Semantics of Multithreaded {Java}", crossref = "ACM:2001:PAJ", pages = "29--38", year = "2001", bibdate = "Mon May 06 09:31:01 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.philippsen.com/JGI2001/camerareadyabstracts/42.html; http://www.philippsen.com/JGI2001/finalpapers/18500029.pdf", acknowledgement = ack-nhfb, keywords = "Java", } @Book{Mauro:2001:SIC, author = "Jim Mauro and Richard McDougall", title = "{Solaris} Internals: Core Kernel Architecture", publisher = pub-SUN-MICROSYSTEMS-PRESS, address = 
pub-SUN-MICROSYSTEMS-PRESS:adr, pages = "xli + 657", year = "2001", ISBN = "0-13-022496-0", ISBN-13 = "978-0-13-022496-5", LCCN = "QA76.76.O63 M37195 2001", bibdate = "Fri Apr 11 16:56:49 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/master.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/unix.bib", series = "Sun BluePrints Program", URL = "http://www.sun.com/books/catalog/mauro/index.html", acknowledgement = ack-nhfb, libnote = "Not in my library.", shorttableofcontents = "The Solaris Memory System \\ Threads, Processes, and IPC \\ Files and File Systems \\ Kernel Tunables, Switches, and Limits \\ Kernel Virtual Address Maps", tableofcontents = "List of Header Files \\ Part 1: Introduction to Solaris Internals \\ 1: An Introduction to Solaris \\ 2: Kernel Services \\ 3: Kernel Synchronization Primitives \\ 4: Kernel Bootstrap and Initialization \\ Part 2: The Solaris Memory System \\ 5: Solaris Memory Architecture \\ 6: Kernel Memory \\ 7: Memory Monitoring \\ Part 3: Threads, Processes, and IPC \\ 8: The Solaris Multithreaded Process Architecture \\ 9: The Solaris Kernel dispatcher \\ 10: Interprocess Communication \\ Part 4: Files and File Systems \\ 11: Solaris Files and File I/O \\ 12: File System Overview \\ 13: File System Framework \\ 14: The UNIX File System \\ 15: Solaris File System Cache \\ Appendix A: Kernel Tunables, Switches, and Limits \\ Appendix B: Kernel Virtual Address Maps \\ Appendix C: A Sample Profs Utility", } @Article{Nagle:2001:MFV, author = "Dan Nagle", title = "Multithreading, {Fthreads}, and {Visual Fortran}", journal = j-DDJ, volume = "26", number = "7", pages = "36, 38, 40", month = jul, year = "2001", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Thu Jun 7 06:07:17 MDT 2001", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.ddj.com/ftp/2001/2001_07/fthreads.zip", abstract = "Dan presents a Fortran module that helps you write 
multithreaded programs for Windows-based applications. Additional resources include fthreads.zip (source code).", acknowledgement = ack-nhfb, fjournal = "Dr. Dobb's Journal of Software Tools", } @Article{Nakhimovsky:2001:ISM, author = "Greg Nakhimovsky", title = "Improving Scalability Of Multithreaded Dynamic Memory Allocation", journal = j-DDJ, volume = "26", number = "7", pages = "44, 46, 48--50, 52, 54", month = jul, year = "2001", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Thu Jun 7 06:07:17 MDT 2001", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.ddj.com/ftp/2001/2001_07/mthot.txt; http://www.ddj.com/ftp/2001/2001_07/mthot.zip", abstract = "Multiprocessor/multithreaded environments add a new dimension to the familiar malloc facility. The ``MT-hot'' implementation Greg presents here lets multiple threads execute in parallel without major delays. Additional resources include mthot.txt (listings) and mthot.zip (source code).", acknowledgement = ack-nhfb, fjournal = "Dr. Dobb's Journal of Software Tools", } @Article{Nikolopoulos:2001:EMA, author = "D. S. Nikolopoulos and E. Artiaga and E. Ayguad{\'e} and J. Labarta", title = "Exploiting memory affinity in {OpenMP} through schedule reuse", journal = j-COMP-ARCH-NEWS, volume = "29", number = "5", pages = "49--55", month = dec, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Ozer:2001:WMT, author = "Emre {\"O}zer and Thomas M. 
Conte and Saurabh Sharma", title = "Weld: a Multithreading Technique Towards Latency-Tolerant {VLIW} Processors", journal = j-LECT-NOTES-COMP-SCI, volume = "2228", pages = "192--??", year = "2001", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Sat Feb 2 13:07:14 MST 2002", bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2228.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer-ny.com/link/service/series/0558/bibs/2228/22280192.htm; http://link.springer-ny.com/link/service/series/0558/papers/2228/22280192.pdf", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @InProceedings{Pang:2001:PSR, author = "James Pang and Gholamali Shoja and Eric Manning", title = "Providing Soft Real-time {QoS} Guarantees for {Java} Threads", crossref = "ACM:2001:PAJ", pages = "39--46", year = "2001", bibdate = "Mon May 06 09:31:01 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.philippsen.com/JGI2001/camerareadyabstracts/21.html; http://www.philippsen.com/JGI2001/finalpapers/18500039.pdf", acknowledgement = ack-nhfb, keywords = "Java", } @Article{Parcerisa:2001:ILT, author = "J.-M. Parcerisa and A. 
Gonzalez", title = "Improving latency tolerance of multithreading through decoupling", journal = j-IEEE-TRANS-COMPUT, volume = "50", number = "10", pages = "1084--1094", month = oct, year = "2001", CODEN = "ITCOB4", DOI = "https://doi.org/10.1109/12.956093", ISSN = "0018-9340 (print), 1557-9956 (electronic)", ISSN-L = "0018-9340", bibdate = "Tue Jul 5 10:03:12 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2000.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=956093", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Computers", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12", } @Article{Plakal:2001:CGC, author = "Manoj Plakal and Charles N. Fischer", title = "Concurrent Garbage Collection Using Program Slices on Multithreaded Processors", journal = j-SIGPLAN, volume = "36", number = "1", pages = "94--100", month = jan, year = "2001", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:18:22 MST 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "ACM SIGPLAN International Symposium on Memory Management (ISMM'00)", } @Article{Protopopov:2001:MMP, author = "Boris V. 
Protopopov and Anthony Skjellum", title = "A Multithreaded {Message Passing Interface (MPI)} Architecture: Performance and Program Issues", journal = j-J-PAR-DIST-COMP, volume = "61", number = "4", pages = "449--466", day = "1", month = apr, year = "2001", CODEN = "JPDCER", DOI = "https://doi.org/10.1006/jpdc.2000.1674", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Fri Feb 22 15:30:36 MST 2002", bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc; https://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pvm.bib", URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.2000.1674; http://www.idealibrary.com/links/doi/10.1006/jpdc.2000.1674/pdf; http://www.idealibrary.com/links/doi/10.1006/jpdc.2000.1674/ref", acknowledgement = ack-nhfb, fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", } @Article{Pyarali:2001:EOT, author = "Irfan Pyarali and Marina Spivak and Ron Cytron and Douglas C. 
Schmidt", title = "Evaluating and Optimizing Thread Pool Strategies for Real-Time {CORBA}", journal = j-SIGPLAN, volume = "36", number = "8", pages = "214--222", month = aug, year = "2001", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:18:29 MST 2003", bibsource = "http://www.cs.wisc.edu/~bodik/om2001/program.html; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, annote = "OM'01: The First Workshop on Optimization of Middleware and Distributed Systems", fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Reilly:2001:TNF, author = "Douglas Reilly", title = "Threading and the {.Net} Framework", journal = j-DDJ, volume = "26", number = "8", pages = "30, 32--33, 36, 38", month = aug, year = "2001", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Wed Jul 11 06:31:35 MDT 2001", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.ddj.com/ftp/2001/2001_08/thrednet.txt", abstract = "Microsoft's .NET Framework offers a number of features, such as threading, that simplify difficult tasks. Additional resources include thrednet.txt (listings).", acknowledgement = ack-nhfb, fjournal = "Dr.
Dobb's Journal of Software Tools", } @Article{Rinard:2001:AMP, author = "Martin Rinard", title = "Analysis of Multithreaded Programs", journal = j-LECT-NOTES-COMP-SCI, volume = "2126", pages = "1--??", year = "2001", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Sat Feb 2 13:05:28 MST 2002", bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2126.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer-ny.com/link/service/series/0558/bibs/2126/21260001.htm; http://link.springer-ny.com/link/service/series/0558/papers/2126/21260001.pdf", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Roh:2001:RMD, author = "Lucas Roh and Bhanu Shankar and Wim B{\"o}hm and Walid Najjar", title = "Resource Management in Dataflow-Based Multithreaded Execution", journal = j-J-PAR-DIST-COMP, volume = "61", number = "5", pages = "581--608", day = "1", month = may, year = "2001", CODEN = "JPDCER", DOI = "https://doi.org/10.1006/jpdc.2001.1708", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Fri Feb 22 15:30:37 MST 2002", bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc; https://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.2001.1708; http://www.idealibrary.com/links/doi/10.1006/jpdc.2001.1708/pdf; http://www.idealibrary.com/links/doi/10.1006/jpdc.2001.1708/ref", acknowledgement = ack-nhfb, fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", } @Article{Salcianu:2001:PEA, author = "Alexandru Salcianu and Martin Rinard", title = "Pointer and escape analysis for multithreaded programs", journal = j-SIGPLAN, volume = "36", number = "7", pages = "12--23", month = jul, year 
= "2001", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:18:28 MST 2003", bibsource = "http://www.acm.org/pubs/contents/proceedings/series/ppopp/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org/pubs/articles/proceedings/ppopp/379539/p12-salcianu/p12-salcianu.pdf; http://www.acm.org/pubs/citations/proceedings/ppopp/379539/p12-salcianu/", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Sigmund:2001:SCS, author = "U. Sigmund and T. Ungerer", title = "On Speculation Control in Simultaneous Multithreaded Processors", journal = j-J-UCS, volume = "7", number = "9", pages = "848--868", day = "28", month = sep, year = "2001", CODEN = "????", ISSN = "0948-695X (print), 0948-6968 (electronic)", ISSN-L = "0948-6968", bibdate = "Wed Feb 20 07:23:10 MST 2002", bibsource = "http://www.jucs.org/jucs; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.jucs.org/jucs_7_9/on_speculation_control_in", acknowledgement = ack-nhfb, fjournal = "J.UCS: Journal of Universal Computer Science", journal-URL = "http://www.jucs.org/jucs", } @Article{Smith:2001:CMM, author = "Burton Smith", title = "{Cray MTA}: Multithreading for Latency Response", journal = j-COMPUTER, volume = "34", number = "4", pages = "69--69", month = apr, year = "2001", CODEN = "CPTRB4", ISSN = "0018-9162 (print), 1558-0814 (electronic)", ISSN-L = "0018-9162", bibdate = "Sat Apr 7 07:21:35 MDT 2001", bibsource = "https://www.math.utah.edu/pub/tex/bib/computer2000.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://dlib.computer.org/co/books/co2001/pdf/r4059.pdf; http://www.computer.org/computer/co2001/r4059abs.htm", acknowledgement = ack-nhfb, annote = "Describes the Cray MTA system, which has up to 256 multithreaded processors. 
There are no data caches: instead, each processor switches context every cycle among up to 128 instruction streams, and each stream can have up to eight outstanding memory references, so memory latency up to 1024 cycles does not delay processing.", fjournal = "Computer", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2", } @Article{Sohi:2001:SMP, author = "Gurindar S. Sohi and Amir Roth", title = "Speculative Multithreaded Processors", journal = j-COMPUTER, volume = "34", number = "4", pages = "66--73", month = apr, year = "2001", CODEN = "CPTRB4", ISSN = "0018-9162 (print), 1558-0814 (electronic)", ISSN-L = "0018-9162", bibdate = "Sat Apr 7 07:21:35 MDT 2001", bibsource = "https://www.math.utah.edu/pub/tex/bib/computer2000.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "See errata \cite{Anonymous:2001:ESM}.", URL = "http://dlib.computer.org/co/books/co2001/pdf/r4066.pdf; http://www.computer.org/computer/co2001/r4066abs.htm", acknowledgement = ack-nhfb, fjournal = "Computer", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2", } @Article{Sohn:2001:CTC, author = "Andrew Sohn and Yuetsu Kodama and Jui-Yuan Ku and Mitsuhisa Sato and Yoshinori Yamaguchi", title = "Chapter 15. 
{Tolerating} Communication Latency through Dynamic Thread Invocation in a Multithreaded Architecture", journal = j-LECT-NOTES-COMP-SCI, volume = "1808", pages = "525--??", year = "2001", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Sat Feb 2 13:02:34 MST 2002", bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t1808.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer-ny.com/link/service/series/0558/bibs/1808/18080525.htm; http://link.springer-ny.com/link/service/series/0558/papers/1808/18080525.pdf", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Steensgaard:2001:TSH, author = "Bjarne Steensgaard", title = "Thread-Specific Heaps for Multi-Threaded Programs", journal = j-SIGPLAN, volume = "36", number = "1", pages = "18--24", month = jan, year = "2001", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:18:22 MST 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "ACM SIGPLAN International Symposium on Memory Management (ISMM'00)", } @Article{Sung:2001:MDA, author = "Michael Sung and Ronny Krashinsky and Krste Asanovi{\'c}", title = "Multithreading decoupled architectures for complexity-effective general purpose computing", journal = j-COMP-ARCH-NEWS, volume = "29", number = "5", pages = "56--61", month = dec, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", 
} @Article{Theobald:2001:DCI, author = "Kevin B. Theobald and Rishi Kumar and Gagan Agrawal and Gerd Heber and Ruppa K. Thulasiram and Guang R. Gao", title = "Developing a Communication Intensive Application on the {EARTH} Multithreaded Architecture (Distinguished Paper)", journal = j-LECT-NOTES-COMP-SCI, volume = "1900", pages = "625--??", year = "2001", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Sat Feb 2 13:02:44 MST 2002", bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t1900.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer-ny.com/link/service/series/0558/bibs/1900/19000625.htm; http://link.springer-ny.com/link/service/series/0558/papers/1900/19000625.pdf", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Zoppetti:2001:IDD, author = "Gary Zoppetti and Gagan Agrawal and Rishi Kumar", title = "Impact of Data Distribution on Performance of Irregular Reductions on Multithreaded Architectures", journal = j-LECT-NOTES-COMP-SCI, volume = "2110", pages = "483--??", year = "2001", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Sat Feb 2 13:05:11 MST 2002", bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2110.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer-ny.com/link/service/series/0558/bibs/2110/21100483.htm; http://link.springer-ny.com/link/service/series/0558/papers/2110/21100483.pdf", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Abraham-Mumm:2002:VJR, author = "Erika {\'A}brah{\'a}m-Mumm and Frank S. 
de Boer and Willem-Paul de Roever and Martin Steffen", title = "Verification for {Java}'s Reentrant Multithreading Concept", journal = j-LECT-NOTES-COMP-SCI, volume = "2303", pages = "5--??", year = "2002", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Tue Sep 10 19:09:21 MDT 2002", bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2303.htm; https://www.math.utah.edu/pub/tex/bib/java2000.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer-ny.com/link/service/series/0558/bibs/2303/23030005.htm; http://link.springer-ny.com/link/service/series/0558/papers/2303/23030005.pdf", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Adiletta:2002:NGI, author = "Matthew Adiletta and Mark Rosenbluth and Debra Bernstein and Gilbert Wolrich and Hugh Wilkinson", title = "The Next Generation of {Intel IXP} Network Processors", journal = j-INTEL-TECH-J, volume = "6", number = "3", pages = "6--18", day = "15", month = aug, year = "2002", ISSN = "1535-766X", bibdate = "Sun Nov 17 11:06:06 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://developer.intel.com/technology/itj/2002/volume06issue03/art01_nextgenixp/p01_abstract.htm; http://developer.intel.com/technology/itj/2002/volume06issue03/art01_nextgenixp/vol6iss3_art01.pdf", keywords = "10Gb/s; ATM; communication architecture; Ethernet; IXP; microprocessor architecture; multi-processors; multi-service switches; multi-threading; network processors; OC-192; OC-48; routing; switching", } @Article{Adiletta:2002:PSA, author = "Matthew Adiletta and Donald Hooper and Myles Wilde", title = "Packet over {SONET}: Achieving 10 {Gigabit}/sec Packet Processing with an {IXP2800}", journal = j-INTEL-TECH-J, volume = "6", number = "3", pages = "29--39", day = "15", month = aug, year = "2002", ISSN = "1535-766X", bibdate = "Sun Nov 17 11:06:06 2002", 
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://developer.intel.com/technology/itj/2002/volume06issue03/art05_packetoversonet/p01_abstract.htm; http://developer.intel.com/technology/itj/2002/volume06issue03/art05_packetoversonet/vol6iss3_art05.pdf", keywords = "10Gbs; ATM; communication architecture; Ethernet; hardware-based multi-threading; IXP; microprocessor architecture; multi-processors; multi-service switches; network processors; OC-192; OC-48; routing; switching", } @Article{Anonymous:2002:ST, author = "Anonymous", title = "Speculative threads", journal = j-COMP-ARCH-NEWS, volume = "30", number = "5", pages = "??--??", month = dec, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:23 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Antoniu:2002:IMP, author = "Gabriel Antoniu and Luc Boug{\'e}", title = "Implementing Multithreaded Protocols for Release Consistency on Top of the Generic {DSM}-{PM} Platform", journal = j-LECT-NOTES-COMP-SCI, volume = "2326", pages = "179--??", year = "2002", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Tue Sep 10 19:09:32 MDT 2002", bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2326.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer-ny.com/link/service/series/0558/bibs/2326/23260179.htm; http://link.springer-ny.com/link/service/series/0558/papers/2326/23260179.pdf", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @InProceedings{Baldwin:2002:LMF, author = "John H. 
Baldwin", title = "Locking in the Multithreaded {FreeBSD} Kernel", crossref = "USENIX:2002:PBF", pages = "27--35", year = "2002", bibdate = "Tue Oct 15 12:37:27 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/usenix2000.bib", URL = "http://www.usenix.org/publications/library/proceedings/bsdcon02/baldwin.html", acknowledgement = ack-nhfb, } @Article{Balis:2002:CPM, author = "B. Balis and M. Bubak and W. Funika and R. Wism{\"u}ller", title = "A Concept of Portable Monitoring of Multithreaded Programs", journal = j-LECT-NOTES-COMP-SCI, volume = "2330", pages = "884--??", year = "2002", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Tue Sep 10 19:09:35 MDT 2002", bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2330.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer-ny.com/link/service/series/0558/bibs/2330/23300884.htm; http://link.springer-ny.com/link/service/series/0558/papers/2330/23300884.pdf", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Boudol:2002:NCP, author = "G{\'e}rard Boudol and Ilaria Castellani", title = "Noninterference for concurrent programs and thread systems", journal = j-THEOR-COMP-SCI, volume = "281", number = "1-2", pages = "109--130", month = may, year = "2002", CODEN = "TCSCDI", ISSN = "0304-3975 (print), 1879-2294 (electronic)", ISSN-L = "0304-3975", bibdate = "Wed Nov 20 18:08:56 MST 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Theoretical Computer Science", journal-URL = "http://www.sciencedirect.com/science/journal/03043975", } @Article{Bouge:2002:IRE, author = "L. Boug{\'e} and V. Danjean and R. 
Namyst", title = "Improving Reactivity to {I/O} Events in Multithreaded Environments Using a Uniform, Scheduler-Centric {API}", journal = j-LECT-NOTES-COMP-SCI, volume = "2400", pages = "605--??", year = "2002", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Thu Sep 12 08:40:04 2002", bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2400.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer-ny.com/link/service/series/0558/bibs/2400/24000605.htm; http://link.springer-ny.com/link/service/series/0558/papers/2400/24000605.pdf", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Brebner:2002:MLC, author = "Gordon Brebner", title = "Multithreading for Logic-Centric Systems", journal = j-LECT-NOTES-COMP-SCI, volume = "2438", pages = "5--??", year = "2002", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Tue Sep 10 19:10:28 MDT 2002", bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2438.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer-ny.com/link/service/series/0558/bibs/2438/24380005.htm; http://link.springer-ny.com/link/service/series/0558/papers/2438/24380005.pdf", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @PhdThesis{Callaway:2002:VTR, author = "John Callaway", title = "Visualization of threads in a running {Java} program", type = "Thesis ({M.S.})", school = "University of California, Santa Cruz", address = "Santa Cruz, CA, USA", year = "2002", LCCN = "QA76.73.J38 C36 2002", bibdate = "Tue May 6 05:26:58 MDT 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "academic dissertations -- University of California, Santa Cruz -- 2002; academic dissertations -- University of California, Santa Cruz -- 
computer; computer science; computer software -- development; Java (computer program language); object-oriented programming (computer science); science; software engineering; visualization", } @Article{Carothers:2002:CMP, author = "Christopher D. Carothers and Boleslaw K. Szymanski", title = "Checkpointing Multithreaded Programs", journal = j-DDJ, volume = "27", number = "8", pages = "??--??", month = aug, year = "2002", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Fri Sep 13 06:15:52 MDT 2002", bibsource = "http://www.ddj.com/articles/2002/0208/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.ddj.com/ftp/2002/2002_08/checkpt.txt", abstract = "Checkpointing is the process by which you grab snapshots of running programs. Additional resources include checkpt.txt (listings).", acknowledgement = ack-nhfb, fjournal = "Dr. Dobb's Journal of Software Tools", } @Article{Cazals:2002:NID, author = "Fr{\'e}d{\'e}ric Cazals", title = "Non-Intrusive Debugging and Incremental Visualization with the Geometric Stethoscope", journal = j-J-GRAPHICS-TOOLS, volume = "7", number = "2", pages = "27--40", year = "2002", CODEN = "JGTOFD", ISSN = "1086-7651", bibdate = "Tue Dec 16 13:47:48 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org/jgt/papers/Cazals02/", abstract = "Developing and debugging geometric applications is known to be a difficult task: The calculations and data structures can be involved, there are degenerate cases and numerical issues, etc. THis paper presents a software setup aiming at easing the development, the debugging, ad the maintenance of geometric applications. \par More precisely, {\em incremental visualization\/} is defined as the possibility for the programmer to visualize interactively any significant update of the geometric data structures at any time. 
{\em Non-intrusive debugging\/} is defined as the possibility of visualizing any geometric entity in three dimensions from a standard debugger at any time without modifying the source code. We present a setup to perform incremental visualization and non-intrusive debugging. This setup is based on multithreading and requires a three-dimensional viewer, such as Open Inventor, Vtk, or Geomview, and a standard debugger (dbx or gdb). \par An Open Inventor based C++ implementation of this setup accompanies this paper. Using it simply requires writing the functions converting the user's data structures into Open Inventor's data structures. The setup could easily be extended to accommodate other medias such as sound, video, etc.", acknowledgement = ack-nhfb, fjournal = "Journal of Graphics Tools: JGT", journal-URL = "http://www.tandfonline.com/loi/ujgt20", } @Article{Chappell:2002:DPB, author = "Robert S. Chappell and Francis Tseng and Adi Yoaz and Yale N. Patt", title = "Difficult-path branch prediction using subordinate microthreads", journal = j-COMP-ARCH-NEWS, volume = "30", number = "2", pages = "307--317", month = may, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Chaudhry:2002:PTS, author = "Puneesh Chaudhry", title = "A Per-Thread Singleton Class", journal = j-CCCUJ, volume = "20", number = "5", pages = "14--??", month = may, year = "2002", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:36 MDT 2002", bibsource = "http://www.cuj.com/articles/2002/0205/0205toc.htm?topic=articles; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "A refreshing look at an old pattern.", acknowledgement = ack-nhfb, fjournal = 
"C/C++ Users Journal", } @Article{Choi:2002:EPD, author = "Jong-Deok Choi and Keunwoo Lee and Alexey Loginov and Robert O'Callahan and Vivek Sarkar and Manu Sridharan", title = "Efficient and precise datarace detection for multithreaded object-oriented programs", journal = j-SIGPLAN, volume = "37", number = "5", pages = "258--269", month = may, year = "2002", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu May 15 12:23:02 MDT 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Choi:2002:IFI, author = "Jong-Deok Choi and Andreas Zeller", title = "Isolating failure-inducing thread schedules", journal = j-SIGSOFT, volume = "27", number = "4", pages = "210--220", month = jul, year = "2002", CODEN = "SFENDP", DOI = "https://doi.org/10.1145/566171.566211", ISSN = "0163-5948 (print), 1943-5843 (electronic)", ISSN-L = "0163-5948", bibdate = "Wed Aug 1 17:14:20 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigsoft2000.bib", abstract = "Consider a multi-threaded application that occasionally fails due to non-determinism. Using the DEJAVU capture/replay tool, it is possible to record the thread schedule and replay the application in a deterministic way. By systematically narrowing down the difference between a thread schedule that makes the program pass and another schedule that makes the program fail, the Delta Debugging approach can pinpoint the error location automatically---namely, the location(s) where a thread switch causes the program to fail. 
In a case study, Delta Debugging isolated the failure-inducing schedule difference from 3.8 billion differences in only 50 tests.", acknowledgement = ack-nhfb, fjournal = "ACM SIGSOFT Software Engineering Notes", journal-URL = "https://dl.acm.org/citation.cfm?id=J728", } @Article{Clark:2002:AMT, author = "Keith Clark and Peter J. Robinson", title = "Agents as Multi-threaded Logical Objects", journal = j-LECT-NOTES-COMP-SCI, volume = "2407", pages = "33--??", year = "2002", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Tue Sep 10 19:10:17 MDT 2002", bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2407.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer-ny.com/link/service/series/0558/bibs/2407/24070033.htm; http://link.springer-ny.com/link/service/series/0558/papers/2407/24070033.pdf", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Cook:2002:REJ, author = "Jonathan J. Cook", title = "Reverse Execution of {Java} Bytecode", journal = j-COMP-J, volume = "45", number = "6", pages = "608--619", month = "????", year = "2002", CODEN = "CMPJA6", DOI = "https://doi.org/10.1093/comjnl/45.6.608", ISSN = "0010-4620 (print), 1460-2067 (electronic)", ISSN-L = "0010-4620", bibdate = "Wed Nov 6 11:21:54 MST 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/compj2000.bib; https://www.math.utah.edu/pub/tex/bib/java2000.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www3.oup.co.uk/computer_journal/hdb/Volume_45/Issue_06/", URL = "http://www3.oup.co.uk/computer_journal/hdb/Volume_45/Issue_06/450608.sgm.abs.html; http://www3.oup.co.uk/computer_journal/hdb/Volume_45/Issue_06/pdf/450608.pdf", abstract = "We demonstrate a model, including operational semantics, for the reverse execution of stack-based code. 
We discuss our modification of the Kaffe implementation of the Java Virtual Machine, supporting a debugger capable of running Java bytecode backwards. We achieve reverse execution by logging the state lost during each operation or by directly reversing instructions. Our debugger has facilities for stepping, stepping over methods and running to breakpoints, in both directions. Multi-threading is supported. It is also possible to step through the bytecode when the Java source code is not available. The debugger has both a command line user interface and a graphical user interface with facilities for editing code and running the Java compiler.", acknowledgement = ack-nhfb, fjournal = "The Computer Journal", journal-URL = "http://comjnl.oxfordjournals.org/", } @Article{Delzanno:2002:TAV, author = "Giorgio Delzanno and Jean-Fran{\c{c}}ois Raskin and Laurent {Van Begin}", title = "Towards the Automated Verification of Multithreaded {Java} Programs", journal = j-LECT-NOTES-COMP-SCI, volume = "2280", pages = "173--??", year = "2002", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Tue Sep 10 19:09:09 MDT 2002", bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2280.htm; https://www.math.utah.edu/pub/tex/bib/java2000.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer-ny.com/link/service/series/0558/bibs/2280/22800173.htm; http://link.springer-ny.com/link/service/series/0558/papers/2280/22800173.pdf", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @InProceedings{Ding:2002:MOP, author = "Yun He and Chris H. Q. 
Ding", key = "multidimensional arrays; index reshuffle; vacancy tracking cycles; global exchange; dynamical remapping; MPI; OpenMP; hybrid MPI/OpenMP; SMP cluster.", title = "{MPI} and {OpenMP} Paradigms on Cluster of {SMP} Architectures", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.sc-2002.org/paperpdfs/pap.pap325.pdf", abstract = "We investigate remapping multi-dimensional arrays on cluster of SMP architectures under OpenMP, MPI, and hybrid paradigms. Traditional method of array transpose needs an auxiliary array of the same size and a copy back stage. We recently developed an in-place method using vacancy tracking cycles. The vacancy tracking algorithm outperforms the traditional 2-array method as demonstrated by extensive comparisons. The independence of vacancy tracking cycles allows efficient parallelization of the in-place method on SMP architectures at node level. Performance of multi-threaded parallelism using OpenMP are tested with different scheduling methods and different number of threads. The vacancy tracking method is parallelized using several parallel paradigms. At node level, pure OpenMP outperforms pure MPI by a factor of 2.76. 
Across entire cluster of SMP nodes, the hybrid MPI/OpenMP implementation outperforms pure MPI by a factor of 4.44, demonstrating the validity of the parallel paradigm of mixing MPI with OpenMP.", acknowledgement = ack-nhfb, } @Article{Donnelly:2002:LTT, author = "Austin Donnelly", title = "Lightweight Thread Tunnelling in Network Applications", journal = j-LECT-NOTES-COMP-SCI, volume = "2546", pages = "48--??", year = "2002", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Sat Nov 30 20:58:13 MST 2002", bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2546.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer.de/link/service/series/0558/bibs/2546/25460048.htm; http://link.springer.de/link/service/series/0558/papers/2546/25460048.pdf", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Edelstein:2002:MJP, author = "O. Edelstein and E. Farchi and Y. Nir and G. Ratsaby and S. Ur", title = "Multithreaded {Java} program test generation", journal = j-IBM-SYS-J, volume = "41", number = "1", pages = "111--125", month = "????", year = "2002", CODEN = "IBMSA7", ISSN = "0018-8670", bibdate = "Tue Feb 12 17:23:05 MST 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.research.ibm.com/journal/", URL = "http://www.research.ibm.com/journal/sj/411/edelstein.html; http://www.research.ibm.com/journal/sj/411/edelstein.pdf", acknowledgement = ack-nhfb, fjournal = "IBM Systems Journal", ordernumber = "G321-0144", } @Article{Editors:2002:LUC, author = "{The Editors} and Kim Reidar Lantz and Ze'ev Atlas and Pete Nelson and Gus J. 
Grubba", title = "Letters: {URL} Correction [``{The NewOS Operating System}'']; Passing Context to Threads; Compiling {Perl\slash Tk} Scripts; Standing by {Al}'s Principles; Understanding Photomosaics", journal = j-DDJ, volume = "27", number = "1", pages = "10, 12", month = jan, year = "2002", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Tue Feb 12 05:21:41 MST 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "See \cite{Geiselbrecht:2001:NOS}.", URL = "http://www.ddj.com/", acknowledgement = ack-nhfb, fjournal = "Dr. Dobb's Journal of Software Tools", } @InProceedings{El-Ghazawi:2002:UPP, author = "Tarek El-Ghazawi and Fran{\c{c}}ois Cantonnet", title = "{UPC} Performance and Potential: a {NPB} Experimental Study", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.sc-2002.org/paperpdfs/pap.pap316.pdf", abstract = "UPC, or Unified Parallel C, is a parallel extension of ANSI C. UPC follows a distributed shared memory programming model aimed at leveraging the ease of programming of the shared memory paradigm, while enabling the exploitation of data locality. UPC incorporates constructs that allow placing data near the threads that manipulate them to minimize remote accesses. This paper gives an overview of the concepts and features of UPC and establishes, through extensive performance measurements of NPB workloads, the viability of the UPC programming language compared to the other popular paradigms. Further, through performance measurements we identify the challenges, the remaining steps and the priorities for UPC. It will be shown that with proper hand tuning libraries, UPC performance will be comparable incorporating such improvements into automatic compare quite favorably to message passing in ease and optimized collective operations to that of MPI. 
Furthermore, by compiler optimizations, UPC will of programming.", acknowledgement = ack-nhfb, keywords = "NPB (NAS Parallel Benchmark)", } @Article{Feuerstein:2002:LMT, author = "E. Feuerstein and A. Strejilevich de Loma", title = "On-Line Multi-Threaded Paging", journal = j-ALGORITHMICA, volume = "32", number = "1", pages = "36--60", month = jan, year = "2002", CODEN = "ALGOEJ", DOI = "https://doi.org/10.1007/s00453-001-0073-z", ISSN = "0178-4617 (print), 1432-0541 (electronic)", ISSN-L = "0178-4617", MRclass = "68N25 (68Q10 68W05)", MRnumber = "MR1867023 (2002h:68033)", bibdate = "Fri Jan 6 11:38:14 MST 2006", bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0178-4617&volume=32&issue=1; https://www.math.utah.edu/pub/tex/bib/index-table-a.html#algorithmica; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; MathSciNet database", URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0178-4617&volume=32&issue=1&spage=36", acknowledgement = ack-nhfb, fjournal = "Algorithmica", journal-URL = "http://link.springer.com/journal/453", } @Article{Flanagan:2002:MCM, author = "Cormac Flanagan and Shaz Qadeer and Sanjit A. Seshia", title = "A Modular Checker for Multithreaded Programs", journal = j-LECT-NOTES-COMP-SCI, volume = "2404", pages = "180--??", year = "2002", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Sat Nov 30 20:57:05 MST 2002", bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2404.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer.de/link/service/series/0558/bibs/2404/24040180.htm; http://link.springer.de/link/service/series/0558/papers/2404/24040180.pdf", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Book{Garg:2002:TOA, author = "Rajat P. 
Garg and Ilya Sharapov", title = "Techniques for optimizing applications: high performance computing", publisher = pub-SUN-MICROSYSTEMS-PRESS, address = pub-SUN-MICROSYSTEMS-PRESS:adr, pages = "xliii + 616", year = "2002", ISBN = "0-13-093476-3", ISBN-13 = "978-0-13-093476-5", LCCN = "QA76.88 .G37 2002", bibdate = "Fri Apr 11 08:26:42 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sun.com/blueprints/", series = "Sun BluePrints Program", URL = "http://www.sun.com/books/catalog/garg.html/index.html", acknowledgement = ack-nhfb, annote = "From the Web site: The \verb=HPC_code_examples.tar.Z= tar-file contains the source code, makefiles, and shell scripts required to compile, link, and run the example programs discussed in the book.", keywords = "Forte Developer; MPI; OpenMP; Sun ClusterTools; Sun Solaris", } @Article{Haggar:2002:JQD, author = "Peter Haggar", title = "{Java Q\&A}: Does {Java} Guarantee Thread Safety?", journal = j-DDJ, volume = "27", number = "6", pages = "91--83", month = jun, year = "2002", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Wed May 1 15:43:59 MDT 2002", bibsource = "http://www.ddj.com/articles/2002/0206/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "Comments on lack of atomic-update guarantee in Java for objects larger than 32 bits, such as {\tt long} and {\tt double}, with sample code to exhibit the failure.", URL = "http://www.ddj.com/ftp/2002/2002_06/jqa0602.txt", abstract = "Additional resources include jqa0602.txt (listings).", acknowledgement = ack-nhfb, fjournal = "Dr. Dobb's Journal of Software Tools", } @Article{Hanson:2002:AFI, author = "Richard J. Hanson and Clay P. Breshears and Henry A. 
Gabb", title = "{Algorithm 821}: a {Fortran} interface to {POSIX} threads", journal = j-TOMS, volume = "28", number = "3", pages = "354--371", month = sep, year = "2002", CODEN = "ACMSCU", DOI = "https://doi.org/10.1145/569147.569152", ISSN = "0098-3500 (print), 1557-7295 (electronic)", ISSN-L = "0098-3500", bibdate = "Sat Nov 9 11:16:50 MST 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Pthreads is the library of POSIX standard functions for concurrent, multithreaded programming. The POSIX standard only defines an application programming interface (API) to the C programming language, not to Fortran. Many scientific and engineering applications are written in Fortran. Also, many of these applications exhibit functional, or task-level, concurrency. They would benefit from multithreading, especially on symmetric multiprocessors (SMP). We present here an interface to that part of the Pthreads library that is compatible with standard Fortran. The contribution consists of two primary source files: a Fortran module and a collection of C wrappers to Pthreads functions. The Fortran module defines the data structures, interface and initialization routines used to manage threads. The stability and portability of the Fortran API to Pthreads is demonstrated using common mathematical computations on three different systems.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Mathematical Software (TOMS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J782", } @InProceedings{Karniadakis:2002:DLP, author = "Suchuan Dong and George Em. 
Karniadakis", title = "Dual-Level Parallelism for Deterministic and Stochastic {CFD} Problems", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.sc-2002.org/paperpdfs/pap.pap137.pdf", abstract = "A hybrid two-level parallelism using MPI/OpenMP is implemented in the general-purpose spectral/hp element CFD code NekTar to take advantage of the hierarchical structures arising in deterministic and stochastic CFD problems. We take a coarse grain approach to shared-memory parallelism with OpenMP and employ a workload-splitting scheme that can reduce the OpenMP synchronizations to the minimum. The hybrid implementation shows good scalability with respect to both the problem size and the number of processors in case of a fixed problem size. With the same number of processors, the hybrid model with 2 (or 4) OpenMP threads per MPI process is observed to perform better than pure MPI and pure OpenMP on the NCSA SGI Origin 2000, while the pure MPI model performs the best on the IBM SP3 at SDSC and on the Compaq Alpha cluster at PSC. A key new result is that the use of threads facilitates effectively prefinement, which is crucial to adaptive discretization using high-order methods.", acknowledgement = ack-nhfb, } @Article{Kavi:2002:MMA, author = "Krishna M. 
Kavi and Alireza Moshtaghi and Deng-jyi Chen", title = "Modeling Multithreaded Applications Using {Petri} Nets", journal = j-INT-J-PARALLEL-PROG, volume = "30", number = "5", pages = "353--371", month = oct, year = "2002", CODEN = "IJPPE5", DOI = "https://doi.org/10.1023/A:1019917329895", ISSN = "0885-7458 (print), 1573-7640 (electronic)", ISSN-L = "0885-7458", bibdate = "Wed Jul 6 16:40:00 MDT 2005", bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=30&issue=5; http://www.kluweronline.com/issn/0885-7458; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://ipsapp009.lwwonline.com/content/getfile/4773/29/1/abstract.htm; http://ipsapp009.lwwonline.com/content/getfile/4773/29/1/fulltext.pdf; http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=30&issue=5&spage=353", acknowledgement = ack-nhfb, fjournal = "International Journal of Parallel Programming", journal-URL = "http://link.springer.com/journal/10766", } @Article{Kempf:2002:BTL, author = "Bill Kempf", title = "The {Boost.Threads} Library", journal = j-CCCUJ, volume = "20", number = "5", pages = "6--??", month = may, year = "2002", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:36 MDT 2002", bibsource = "http://www.cuj.com/articles/2002/0205/0205toc.htm?topic=articles; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Standard C++ threads are imminent. 
CUJ predicts they will derive from the Boost.Threads library, explored here by the eminent author.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{Laneve:2002:TSJ, author = "Cosimo Laneve", title = "A type system for {JVM} threads", journal = j-THEOR-COMP-SCI, volume = "290", number = "1", pages = "741--778", month = oct, year = "2002", CODEN = "TCSCDI", ISSN = "0304-3975 (print), 1879-2294 (electronic)", ISSN-L = "0304-3975", bibdate = "Wed Nov 20 18:15:29 MST 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Theoretical Computer Science", journal-URL = "http://www.sciencedirect.com/science/journal/03043975", } @Article{Leman:2002:EFT, author = "Dmitri Leman", title = "An Efficient and Flexible Tracing Technique", journal = j-CCCUJ, volume = "20", number = "4", pages = "24--??", month = apr, year = "2002", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:36 MDT 2002", bibsource = "http://www.cuj.com/articles/2002/0204/0204toc.htm?topic=articles; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "This extensible tracing framework tames the dreaded multithreaded debugging demon.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{Mahinthakumar:2002:HMO, author = "G. Mahinthakumar and F. Saied", title = "A Hybrid {MPI-OpenMP} Implementation of an Implicit Finite-Element Code on Parallel Architectures", journal = j-IJHPCA, volume = "16", number = "4", pages = "371--393", month = "Winter", year = "2002", CODEN = "IHPCFL", ISSN = "1094-3420 (print), 1741-2846 (electronic)", ISSN-L = "1094-3420", bibdate = "Fri Nov 28 06:52:13 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, journal-URL = "http://hpc.sagepub.com/content/by/year", } @Article{Martinez:2002:SSAa, author = "Jos{\'e} F. 
Mart{\'\i}nez and Josep Torrellas", title = "Speculative synchronization: applying thread-level speculation to explicitly parallel applications", journal = j-COMP-ARCH-NEWS, volume = "30", number = "5", pages = "18--29", month = dec, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:23 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Martinez:2002:SSAb, author = "Jos{\'e} F. Mart{\'\i}nez and Josep Torrellas", title = "Speculative synchronization: applying thread-level speculation to explicitly parallel applications", journal = j-SIGPLAN, volume = "37", number = "10", pages = "18--29", month = oct, year = "2002", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu May 15 12:23:09 MDT 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Martinez:2002:SSAc, author = "Jos{\'e} F. Mart{\'\i}nez and Josep Torrellas", title = "Speculative synchronization: applying thread-level speculation to explicitly parallel applications", journal = j-OPER-SYS-REV, volume = "36", number = "5", pages = "18--29", month = dec, year = "2002", CODEN = "OSRED8", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:56 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @Article{Mauer:2002:FST, author = "Carl J. Mauer and Mark D. Hill and David A. 
Wood", title = "Full-system timing-first simulation", journal = j-SIGMETRICS, volume = "30", number = "1", pages = "108--116", month = jun, year = "2002", CODEN = "????", DOI = "https://doi.org/10.1145/511334.511349", ISSN = "0163-5999 (print), 1557-9484 (electronic)", ISSN-L = "0163-5999", bibdate = "Thu Jun 26 11:38:22 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Computer system designers often evaluate future design alternatives with detailed simulators that strive for {\em functional fidelity\/} (to execute relevant workloads) and {\em performance fidelity\/} (to rank design alternatives). Trends toward multi-threaded architectures, more complex micro-architectures, and richer workloads, make authoring detailed simulators increasingly difficult. To manage simulator complexity, this paper advocates decoupled simulator organizations that separate functional and performance concerns. Furthermore, we define an approach, called {\em timing-first simulation}, that uses an augmented timing simulator to execute instructions important to performance in conjunction with a functional simulator to insure correctness. This design simplifies software development, leverages existing simulators, and can model micro-architecture timing in detail. We describe the timing-first organization and our experiences implementing TFsim, a full-system multiprocessor performance simulator. TFsim models a pipelined, out-of-order micro-architecture in detail, was developed in less than one person-year, and performs competitively with previously-published simulators. TFsim's timing simulator implements dynamically common instructions (99.99\% of them), while avoiding the vast and exacting implementation efforts necessary to run unmodified commercial operating systems and workloads. Virtutech Simics, a full-system functional simulator, checks and corrects the timing simulator's execution, contributing 18-36\% to the overall run-time. 
TFsim's mostly correct functional implementation introduces a worst-case performance error of 4.8\% for our commercial workloads. Some additional simulator performance is gained by verifying functional correctness less often, at the cost of some additional performance error.", acknowledgement = ack-nhfb, fjournal = "ACM SIGMETRICS Performance Evaluation Review", journal-URL = "http://portal.acm.org/toc.cfm?id=J618", } @Article{Mukherjee:2002:DDE, author = "Shubhendu S. Mukherjee and Michael Kontz and Steven K. Reinhardt", title = "Detailed design and evaluation of redundant multithreading alternatives", journal = j-COMP-ARCH-NEWS, volume = "30", number = "2", pages = "99--110", month = may, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Oplinger:2002:ESRa, author = "Jeffrey Oplinger and Monica S. Lam", title = "Enhancing software reliability with speculative threads", journal = j-COMP-ARCH-NEWS, volume = "30", number = "5", pages = "184--196", month = dec, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:23 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Oplinger:2002:ESRb, author = "Jeffrey Oplinger and Monica S. 
Lam", title = "Enhancing software reliability with speculative threads", journal = j-SIGPLAN, volume = "37", number = "10", pages = "184--196", month = oct, year = "2002", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu May 15 12:23:09 MDT 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Oplinger:2002:ESRc, author = "Jeffrey Oplinger and Monica S. Lam", title = "Enhancing software reliability with speculative threads", journal = j-OPER-SYS-REV, volume = "36", number = "5", pages = "184--196", month = dec, year = "2002", CODEN = "OSRED8", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:56 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @Article{Plachetka:2002:QTS, author = "Tomas Plachetka", title = "(Quasi-) Thread-Safe {PVM} and (Quasi-) Thread-Safe {MPI} without Active Polling", journal = j-LECT-NOTES-COMP-SCI, volume = "2474", pages = "296--??", year = "2002", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Sat Nov 30 20:57:35 MST 2002", bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2474.htm; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer.de/link/service/series/0558/bibs/2474/24740296.htm; http://link.springer.de/link/service/series/0558/papers/2474/24740296.pdf", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Sato:2002:SJL, author = "Y. 
Sato", title = "A Study of {Java} Language for Effective Thread Migration", journal = "Record of Electrical and Communication Engineering Conversazione Tohoku University", volume = "71", number = "1", publisher = "Tohoku Daigaku Dentsu Danwakai", pages = "597--598", year = "2002", CODEN = "????", ISSN = "0385-7719", bibdate = "Tue Dec 24 07:09:37 MST 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; Ingenta database", acknowledgement = ack-nhfb, } @Article{Shene:2002:TST, author = "Ching-Kuang Shene", title = "{ThreadMentor}: a system for teaching multithreaded programming", journal = j-SIGCSE, volume = "34", number = "3", pages = "229--229", month = sep, year = "2002", CODEN = "SIGSD3", DOI = "https://doi.org/10.1145/637610.544497", ISSN = "0097-8418 (print), 2331-3927 (electronic)", ISSN-L = "0097-8418", bibdate = "Sat Nov 17 16:56:56 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigcse2000.bib", acknowledgement = ack-nhfb, fjournal = "SIGCSE Bulletin (ACM Special Interest Group on Computer Science Education)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J688", } @Article{Snavely:2002:SJP, author = "Allan Snavely and Dean M. Tullsen and Geoff Voelker", title = "Symbiotic jobscheduling with priorities for a simultaneous multithreading processor", journal = j-SIGMETRICS, volume = "30", number = "1", pages = "66--76", month = jun, year = "2002", CODEN = "????", DOI = "https://doi.org/10.1145/511399.511343", ISSN = "0163-5999 (print), 1557-9484 (electronic)", ISSN-L = "0163-5999", bibdate = "Thu Jun 26 11:38:22 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Simultaneous Multithreading machines benefit from jobscheduling software that monitors how well coscheduled jobs share CPU resources, and coschedules jobs that interact well to make more efficient use of those resources. 
As a result, informed coscheduling can yield significant performance gains over naive schedulers. However, prior work on coscheduling focused on equal-priority job mixes, which is an unrealistic assumption for modern operating systems. This paper demonstrates that a scheduler for an SMT machine can both satisfy process priorities and symbiotically schedule low and high priority threads to increase system throughput. Naive priority schedulers dedicate the machine to high priority jobs to meet priority goals, and as a result decrease opportunities for increased performance from multithreading and coscheduling. More informed schedulers, however, can dynamically monitor the progress and resource utilization of jobs on the machine, and dynamically adjust the degree of multithreading to improve performance while still meeting priority goals. Using detailed simulation of an SMT architecture, we introduce and evaluate a series of five software and hardware-assisted priority schedulers. Overall, our results indicate that coscheduling priority jobs can significantly increase system throughput by as much as 40\%, and that (1) the benefit depends upon the relative priority of the coscheduled jobs, and (2) more sophisticated schedulers are more effective when the differences in priorities are greatest. We show that our priority schedulers can decrease average turnaround times for a random job mix by as much as 33\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGMETRICS Performance Evaluation Review", journal-URL = "http://portal.acm.org/toc.cfm?id=J618", keywords = "job scheduling; priorities; simultaneous multithreading", } @Article{Sodan:2002:AMA, author = "Angela C. 
Sodan", title = "Applications on a multithreaded architecture: a case study with {EARTH-MANNA}", journal = j-PARALLEL-COMPUTING, volume = "28", number = "1", pages = "3--33", month = jan, year = "2002", CODEN = "PACOEJ", ISSN = "0167-8191 (print), 1872-7336 (electronic)", ISSN-L = "0167-8191", bibdate = "Fri Feb 22 16:52:43 MST 2002", bibsource = "http://www.elsevier.com/locate/issn/01678191; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.elsevier.com/gej-ng/10/35/21/60/27/28/abstract.html; http://www.elsevier.nl/gej-ng/10/35/21/60/27/28/00001684.pdf", acknowledgement = ack-nhfb, fjournal = "Parallel Computing", journal-URL = "http://www.sciencedirect.com/science/journal/01678191", } @Article{Solihin:2002:UUL, author = "Yan Solihin and Jaejin Lee and Josep Torrellas", title = "Using a user-level memory thread for correlation prefetching", journal = j-COMP-ARCH-NEWS, volume = "30", number = "2", pages = "171--182", month = may, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @InProceedings{Sterling:2002:GMP, author = "Thomas L. Sterling and Hans P. Zima", title = "{Gilgamesh}: a Multithreaded Processor-In-Memory Architecture for Petaflops Computing", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.sc-2002.org/paperpdfs/pap.pap105.pdf", abstract = "Processor-in-Memory (PIM) architectures avoid the von Neumann bottleneck in conventional machines by integrating high-density DRAM and CMOS logic on the same chip. 
Parallel systems based on this new technology are expected to provide higher scalability, adaptability, robustness, fault tolerance and lower power consumption than current MPPs or commodity clusters. In this paper we describe the design of Gilgamesh, a PIM-based massively parallel architecture, and elements of its execution model. Gilgamesh extends existing PIM capabilities by incorporating advanced mechanisms for virtualizing tasks and data and providing adaptive resource management for load balancing and latency tolerance. The Gilgamesh execution model is based on macroservers, a middleware layer which supports object-based runtime management of data and threads allowing explicit and dynamic control of locality and load balancing. The paper concludes with a discussion of related research activities and an outlook to future work.", acknowledgement = ack-nhfb, } @Article{Stoller:2002:MCM, author = "Scott D. Stoller", title = "Model-checking multi-threaded distributed {Java} programs", journal = j-INT-J-SOFTW-TOOLS-TECHNOL-TRANSFER, volume = "4", number = "1", pages = "71--91", month = oct, year = "2002", CODEN = "????", DOI = "https://doi.org/10.1007/s10009-002-0077-2", ISSN = "1433-2779 (print), 1433-2787 (electronic)", ISSN-L = "1433-2779", bibdate = "Tue Nov 23 15:01:41 MST 2004", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "International Journal on Software Tools for Technology Transfer: STTT", } @Article{Sung:2002:CPE, author = "Minyoung Sung and Soyoung Kim and Sangsoo Park and Naehyuck Chang and Heonshik Shin", title = "Comparative performance evaluation of {Java} threads for embedded applications: {Linux Thread} vs. 
{Green Thread}", journal = j-INFO-PROC-LETT, volume = "84", number = "4", pages = "221--225", day = "30", month = nov, year = "2002", CODEN = "IFPLAT", ISSN = "0020-0190 (print), 1872-6119 (electronic)", ISSN-L = "0020-0190", bibdate = "Mon Jan 26 08:44:30 MST 2004", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sciencedirect.com/science/journal/00200190", acknowledgement = ack-nhfb, fjournal = "Information Processing Letters", journal-URL = "http://www.sciencedirect.com/science/journal/00200190", } @Article{Tennberg:2002:RGO, author = "Patrick Tennberg", title = "Refactoring Global Objects in Multithreaded Applications", journal = j-CCCUJ, volume = "20", number = "5", pages = "20--??", month = may, year = "2002", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:36 MDT 2002", bibsource = "http://www.cuj.com/articles/2002/0205/0205toc.htm?topic=articles; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Although you may get fired for introducing any new global variables, it's too much work to rewrite old code to remove them. So make them thread-safe and stop worrying.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{Theobald:2002:IEC, author = "Kevin B. Theobald and Rishi Kumar and Gagan Agrawal and Gerd Heber and Ruppa K. Thulasiram and Guang R. 
Gao", title = "Implementation and evaluation of a communication intensive application on the {EARTH} multithreaded system", journal = j-CCPE, volume = "14", number = "3", pages = "183--201", month = mar, year = "2002", CODEN = "CCPEBO", DOI = "https://doi.org/10.1002/cpe.604", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Sat May 18 14:54:00 MDT 2002", bibsource = "http://www.interscience.wiley.com/jpages/1532-0626; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www3.interscience.wiley.com/journalfinder.html", URL = "http://www3.interscience.wiley.com/cgi-bin/abstract/93513486/START; http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=93513486{\&}PLACEBO=IE.pdf", acknowledgement = ack-nhfb, fjournal = "Concurrency and Computation: Prac\-tice and Experience", journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626", } @Article{Thulasiraman:2002:EMA, author = "Parimala Thulasiraman and Kevin Theobald and Ashfaq A. Khokhar and Guang R. 
Gao", title = "Efficent Multithreaded Algorithms for the {Fast Fourier Transform}", journal = j-PARALLEL-DIST-COMP-PRACT, volume = "5", number = "2", pages = "239--258", month = jun, year = "2002", CODEN = "????", ISSN = "1097-2803", bibdate = "Thu Sep 2 12:08:56 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.scpe.org/content/5/2.toc", acknowledgement = ack-nhfb, fjournal = "PDCP: Parallel and Distributed Computing Practices", } @Article{Ungerer:2002:MP, author = "Theo Ungerer and Borut Robi{\v{c}} and Jurij {\v{S}}ilc", title = "Multithreaded Processors", journal = j-COMP-J, volume = "45", number = "3", pages = "320--348", month = "????", year = "2002", CODEN = "CMPJA6", ISSN = "0010-4620 (print), 1460-2067 (electronic)", ISSN-L = "0010-4620", bibdate = "Fri May 10 10:12:07 MDT 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www3.oup.co.uk/computer_journal/hdb/Volume_45/Issue_03/", URL = "http://www3.oup.co.uk/computer_journal/hdb/Volume_45/Issue_03/450320.sgm.abs.html; http://www3.oup.co.uk/computer_journal/hdb/Volume_45/Issue_03/pdf/450320.pdf", acknowledgement = ack-nhfb, fjournal = "The Computer Journal", journal-URL = "http://comjnl.oxfordjournals.org/", } @Article{Ungerer:2002:SPE, author = "Theo Ungerer and Borut Robi{\v{c}} and Jurij {\v{S}}ilc", title = "A survey of processors with explicit multithreading", journal = j-COMP-SURV, volume = "35", number = "1", pages = "29--63", month = mar, year = "2002", CODEN = "CMSVAN", ISSN = "0360-0300 (print), 1557-7341 (electronic)", ISSN-L = "0360-0300", bibdate = "Thu Aug 7 06:57:01 MDT 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM Computing Surveys", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J204", } @Article{Vijaykumar:2002:TFR, author = "T. N. 
Vijaykumar and Irith Pomeranz and Karl Cheng", title = "Transient-fault recovery using simultaneous multithreading", journal = j-COMP-ARCH-NEWS, volume = "30", number = "2", pages = "87--98", month = may, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Wang:2002:SPE, author = "Hong Wang and Perry H. Wang and Ross Dave Weldon and Scott M. Ettinger and Hideki Saito and Milind Girkar and Steve Shih-wei Liao and John P. Shen", title = "Speculative Precomputation: Exploring the Use of Multithreading for Latency Tools", journal = j-INTEL-TECH-J, volume = "6", number = "1", pages = "22--35", month = feb, year = "2002", ISSN = "1535-766X", bibdate = "Thu Feb 28 15:24:21 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://developer.intel.com/technology/itj/2002/volume06issue01/vol6iss1_hyper_threading_technology.pdf", } @Article{Yan:2002:RCC, author = "C. Yan", title = "Race condition and concurrency safety of multithreaded object-oriented programming in {Java}", journal = "IEEE International Conference on Systems Man and Cybernetics", volume = "6", pages = "??--??", year = "2002", CODEN = "????", ISSN = "1062-922X", bibdate = "Tue Apr 8 06:53:44 MDT 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; Ingenta database", acknowledgement = ack-nhfb, xxpages = "WA1Q3", } @Article{Zhai:2002:COSa, author = "Antonia Zhai and Christopher B. Colohan and J. Gregory Steffan and Todd C. 
Mowry", title = "Compiler optimization of scalar value communication between speculative threads", journal = j-COMP-ARCH-NEWS, volume = "30", number = "5", pages = "171--183", month = dec, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (ACM), 0884-7495 (IEEE)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:23 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Zhai:2002:COSb, author = "Antonia Zhai and Christopher B. Colohan and J. Gregory Steffan and Todd C. Mowry", title = "Compiler optimization of scalar value communication between speculative threads", journal = j-SIGPLAN, volume = "37", number = "10", pages = "171--183", month = oct, year = "2002", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu May 15 12:23:09 MDT 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Zuberek:2002:APB, author = "W. M. Zuberek", title = "Analysis of Performance Bottlenecks in Multithreaded Multiprocessor Systems", journal = j-FUND-INFO, volume = "50", number = "2", pages = "223--241", month = feb, year = "2002", CODEN = "FUMAAJ", ISSN = "0169-2968 (print), 1875-8681 (electronic)", ISSN-L = "0169-2968", bibdate = "Sat Mar 5 16:59:23 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/fundinfo2000.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Fundamenta Informaticae", journal-URL = "http://content.iospress.com/journals/fundamenta-informaticae", } @Article{Aamodt:2003:FMO, author = "Tor M. 
Aamodt and Pedro Marcuello and Paul Chow and Antonio Gonz{\'a}lez and Per Hammarlund and Hong Wang and John P. Shen", title = "A framework for modeling and optimization of prescient instruction prefetch", journal = j-SIGMETRICS, volume = "31", number = "1", pages = "13--24", month = jun, year = "2003", CODEN = "????", DOI = "https://doi.org/10.1145/781027.781030", ISSN = "0163-5999 (print), 1557-9484 (electronic)", ISSN-L = "0163-5999", bibdate = "Thu Jun 26 11:41:41 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "This paper describes a framework for modeling macroscopic program behavior and applies it to optimizing prescient instruction prefetch --- novel technique that uses helper threads to improve single-threaded application performance by performing judicious and timely instruction prefetch. A helper thread is initiated when the main thread encounters a spawn point, and prefetches instructions starting at a distant target point. The target identifies a code region tending to incur I-cache misses that the main thread is likely to execute soon, even though intervening control flow may be unpredictable. The optimization of spawn-target pair selections is formulated by modeling program behavior as a Markov chain based on profile statistics. Execution paths are considered stochastic outcomes, and aspects of program behavior are summarized via path expression mappings. Mappings for computing reaching, and posteriori probability; path length mean, and variance; and expected path footprint are presented. These are used with Tarjan's fast path algorithm to efficiently estimate the benefit of spawn-target pair selections. Using this framework we propose a spawn-target pair selection algorithm for prescient instruction prefetch. This algorithm has been implemented, and evaluated for the Itanium Processor Family architecture. 
A limit study finds 4.8\% to 17\% speedups
Moreira and Henry S. {Warren, Jr.}", title = "Dissecting {Cyclops}: a detailed analysis of a multithreaded architecture", journal = j-COMP-ARCH-NEWS, volume = "31", number = "1", pages = "26--38", month = mar, year = "2003", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:37 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Balis:2003:MSM, author = "Bartosz Bali{\'s} and Marian Bubak and W{\l}odzimierz Funika and Roland Wism{\"u}ller", title = "A monitoring system for multithreaded applications", journal = j-FUT-GEN-COMP-SYS, volume = "19", number = "5", pages = "641--650", month = jul, year = "2003", CODEN = "FGSEVI", ISSN = "0167-739X (print), 1872-7115 (electronic)", ISSN-L = "0167-739X", bibdate = "Sat Jan 10 10:03:34 MST 2004", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Future Generation Computer Systems", journal-URL = "http://www.sciencedirect.com/science/journal/0167739X", remark = "Tools for Program Development and Analysis. Best papers from two Technical Sessions, at ICCS2001, San Francisco, CA, USA, and ICCS2002, Amsterdam, The Netherlands.", } @Article{Barekas:2003:MAO, author = "Vasileios K. Barekas and Panagiotis E. Hadjidoukas and Eleftherios D. 
Polychronopoulos and others", title = "A Multiprogramming Aware {OpenMP} Implementation", journal = j-SCI-PROG, volume = "11", number = "2", pages = "133--141", year = "2003", CODEN = "SCIPEV", ISSN = "1058-9244 (print), 1875-919X (electronic)", ISSN-L = "1058-9244", bibdate = "Mon Jan 12 06:28:15 MST 2004", bibsource = "http://www.iospress.nl/site/html/10589244.html; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Scientific Programming", journal-URL = "http://iospress.metapress.com/content/1058-9244", } @Article{Brightwell:2003:DIP, author = "Ron Brightwell and Rolf Riesen and Arthur B. Maccabe", title = "Design, Implementation, and Performance of {MPI} on {Portals 3.0}", journal = j-IJHPCA, volume = "17", number = "1", pages = "7--20", month = "Spring", year = "2003", CODEN = "IHPCFL", ISSN = "1094-3420 (print), 1741-2846 (electronic)", ISSN-L = "1094-3420", bibdate = "Fri Nov 28 06:52:13 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, journal-URL = "http://hpc.sagepub.com/content/by/year", } @Article{Briguglio:2003:PPM, author = "Sergio Briguglio and Beniamino Di Martino and Gregorio Vlad", title = "A performance-prediction model for {PIC} applications on clusters of Symmetric MultiProcessors: Validation with hierarchical {HPF $+$ OpenMP} implementation", journal = j-SCI-PROG, volume = "11", number = "2", pages = "159--176", year = "2003", CODEN = "SCIPEV", ISSN = "1058-9244 (print), 1875-919X (electronic)", ISSN-L = "1058-9244", bibdate = "Mon Jan 12 06:28:15 MST 2004", bibsource = "http://www.iospress.nl/site/html/10589244.html; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Scientific Programming", journal-URL = "http://iospress.metapress.com/content/1058-9244", } @Article{Carr:2003:TPT, author = "Steve Carr and Jean Mayo and Ching-Kuang Shene", title = "{ThreadMentor}: a pedagogical tool for 
multithreaded programming", journal = j-JERIC, volume = "3", number = "1", pages = "1--30", month = mar, year = "2003", CODEN = "????", ISSN = "1531-4278", bibdate = "Tue Feb 3 18:43:37 MST 2004", bibsource = "http://www.acm.org/pubs/contents/journals/jeric/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM Journal on Educational Resources in Computing (JERIC)", } @InProceedings{Chakravarti:2003:ISM, author = "A. Chakravarti and X. Wang and J. Hallstrom and G. Baumgartner", booktitle = "Proceedings of the International Conference on Parallel Processing", title = "Implementation of Strong Mobility for Multi-threaded Agents in {Java}", publisher = "????", address = "????", pages = "321--332", year = "2003", CODEN = "????", ISSN = "0190-3918", bibdate = "Tue Dec 2 18:51:43 MST 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2000.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; Ingenta database", acknowledgement = ack-nhfb, } @Article{Chen:2003:CSS, author = "Peng-Sheng Chen and Ming-Yu Hung and Yuan-Shin Hwang and Roy Dz-Ching Ju and Jenq Kuen Lee", title = "Compiler support for speculative multithreading architecture with probabilistic points-to analysis", journal = j-SIGPLAN, pages = "25--36", year = "2003", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 22 16:52:42 MST 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Domani:2003:TLH, author = "Tamar Domani and Gal Goldshtein and Elliot K. 
Kolodner and Ethan Lewis and Erez Petrank and Dafna Sheinwald", title = "Thread-Local Heaps for {Java}", journal = j-SIGPLAN, volume = "38", number = "2s", pages = "183--194", month = feb, year = "2003", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu May 15 12:23:14 MDT 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; Ingenta database", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Edelstein:2003:FTM, author = "Orit Edelstein and Eitan Farchi and Evgeny Goldin and Yarden Nir and Gil Ratsaby and Shmuel Ur", title = "Framework for testing multi-threaded {Java} programs", journal = j-CCPE, volume = "15", number = "3--5", pages = "485--499", month = mar # "\slash " # apr, year = "2003", CODEN = "CCPEBO", DOI = "https://doi.org/10.1002/cpe.654", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Tue Jan 13 09:28:08 MST 2004", bibsource = "http://www.interscience.wiley.com/jpages/1532-0626; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www3.interscience.wiley.com/journalfinder.html", acknowledgement = ack-nhfb, fjournal = "Concurrency and Computation: Prac\-tice and Experience", journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626", onlinedate = "12 Feb 2003", } @Article{Fang:2003:DGO, author = "Weijian Fang and Cho-Li Wang and Francis C. M. 
Lau", title = "On the design of global object space for efficient multi-threading {Java} computing on clusters", journal = j-PARALLEL-COMPUTING, volume = "29", number = "11--12", pages = "1563--1587", month = nov # "\slash " # dec, year = "2003", CODEN = "PACOEJ", ISSN = "0167-8191 (print), 1872-7336 (electronic)", ISSN-L = "0167-8191", bibdate = "Wed Dec 24 09:07:29 MST 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Parallel Computing", journal-URL = "http://www.sciencedirect.com/science/journal/01678191", } @Article{Gagnon:2003:EIT, author = "E. Gagnon and L. Hendren", title = "Effective Inline-Threaded Interpretation of {Java} Bytecode Using Preparation Sequences", journal = j-LECT-NOTES-COMP-SCI, volume = "2622", pages = "170--184", year = "2003", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Tue Apr 15 07:54:18 MDT 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; Ingenta database", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Gould:2003:GLT, author = "Nicholas I. M. Gould and Dominique Orban and Philippe L. Toint", title = "{GALAHAD}, a library of thread-safe {Fortran 90} packages for large-scale nonlinear optimization", journal = j-TOMS, volume = "29", number = "4", pages = "353--372", month = dec, year = "2003", CODEN = "ACMSCU", DOI = "https://doi.org/10.1145/962437.962438", ISSN = "0098-3500 (print), 1557-7295 (electronic)", ISSN-L = "0098-3500", bibdate = "Mon Jan 5 17:18:49 MST 2004", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "We describe the design of version 1.0 of GALAHAD, a library of Fortran 90 packages for large-scale nonlinear optimization. 
The library particularly addresses quadratic programming problems, containing both interior point and active set algorithms, as well as tools for preprocessing problems prior to solution. It also contains an updated version of the venerable nonlinear programming package, LANCELOT.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Mathematical Software (TOMS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J782", } @Article{Grossman:2003:TSM, author = "Dan Grossman", title = "Type-safe multithreading in cyclone", journal = j-SIGPLAN, volume = "38", number = "3", pages = "13--25", month = mar, year = "2003", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu May 15 12:23:16 MDT 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Heinlein:2003:ATS, author = "C. 
Heinlein", title = "Advanced Thread Synchronization in {Java} Using Interaction Expressions", journal = j-LECT-NOTES-COMP-SCI, volume = "2591", pages = "345--365", year = "2003", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Tue Apr 1 06:09:06 MST 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; Ingenta database", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Jin:2003:AMP, author = "Haoqiang Jin and Gabriele Jost and Jerry Yan and others", title = "Automatic multilevel parallelization using {OpenMP}", journal = j-SCI-PROG, volume = "11", number = "2", pages = "177--190", year = "2003", CODEN = "SCIPEV", ISSN = "1058-9244 (print), 1875-919X (electronic)", ISSN-L = "1058-9244", bibdate = "Mon Jan 12 06:28:15 MST 2004", bibsource = "http://www.iospress.nl/site/html/10589244.html; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Scientific Programming", journal-URL = "http://iospress.metapress.com/content/1058-9244", } @InProceedings{Kee:2003:POP, author = "Yang-Suk Kee and Jin-Soo Kim and Soonhoi Ha", title = "{ParADE}: An {OpenMP} Programming Environment for {SMP} Cluster Systems", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10708#0; http://www.sc-conference.org/sc2003/paperpdfs/pap130.pdf", abstract = "Demand for programming environments to exploit clusters of symmetric multiprocessors (SMPs) is increasing. In this paper, we present a new programming environment, called ParADE, to enable easy, portable, and high-performance programming on SMP clusters. 
It is an OpenMP programming environment on top of a multi-threaded software distributed shared memory (SDSM) system with a variant of home-based lazy release consistency protocol. To boost performance, the runtime system provides explicit message-passing primitives to make it a hybrid-programming environment. Collective communication primitives are used for the synchronization and work-sharing directives associated with small data structures, lessening the synchronization overhead and avoiding the implicit barriers of work-sharing directives. The OpenMP translator bridges the gap between the OpenMP abstraction and the hybrid programming interfaces of the runtime system. The experiments with several NAS benchmarks and applications on a Linux-based cluster show promising results that ParADE overcomes the performance problem of the conventional SDSM-based OpenMP environment.", acknowledgement = ack-nhfb, keywords = "hybrid programming; MPI; OpenMP; programming environment; SMP cluster; software distributed shared memory", } @Article{Keen:2003:CCP, author = "Aaron W. Keen and Takashi Ishihara and Justin T. Maris and Tiejun Li and Eugene F. Fodor and Ronald A. 
Olsson", title = "A comparison of concurrent programming and cooperative multithreading", journal = j-CCPE, volume = "15", number = "1", pages = "27--53", month = jan, year = "2003", CODEN = "CCPEBO", DOI = "https://doi.org/10.1002/cpe.706", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Tue Jan 13 09:28:05 MST 2004", bibsource = "http://www.interscience.wiley.com/jpages/1532-0626; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www3.interscience.wiley.com/journalfinder.html", acknowledgement = ack-nhfb, fjournal = "Concurrency and Computation: Prac\-tice and Experience", journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626", onlinedate = "6 Jan 2003", } @Article{Kepner:2003:MTF, author = "Jeremy Kepner", title = "A multi-threaded fast convolver for dynamically parallel image filtering", journal = j-J-PAR-DIST-COMP, volume = "63", number = "3", pages = "360--372", month = mar, year = "2003", CODEN = "JPDCER", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Tue Dec 16 16:10:40 MST 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", } @InProceedings{Klasky:2003:GBP, author = "Scott Alan Klasky and Stephane Ethier and Zhihong Lin and Kevin Martins and Doug McCune and Ravi Samtaney", title = "Grid-Based Parallel Data Streaming implemented for the Gyrokinetic Toroidal Code", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10722#2; http://www.sc-conference.org/sc2003/paperpdfs/pap207.pdf", abstract = "We have developed a threaded parallel data streaming approach using Globus to 
transfer multi-terabyte simulation data from a remote supercomputer to the scientist's home analysis/visualization cluster, as the simulation executes, with negligible overhead. Data transfer experiments show that this concurrent data transfer approach is more favorable compared with writing to local disk and then transferring this data to be post-processed. The present approach is conducive to using the grid to pipeline the simulation with post-processing and visualization. We have applied this method to the Gyrokinetic Toroidal Code (GTC), a 3-dimensional particle-in-cell code used to study micro-turbulence in magnetic confinement fusion from first principles plasma theory.", acknowledgement = ack-nhfb, } @Article{Koster:2003:TTI, author = "Rainer Koster and Andrew P. Black and Jie Huang and Jonathan Walpole and Calton Pu", title = "Thread transparency in information flow middleware", journal = j-SPE, volume = "33", number = "4", pages = "321--349", month = apr, year = "2003", CODEN = "SPEXBL", DOI = "https://doi.org/10.1002/spe.510", ISSN = "0038-0644 (print), 1097-024X (electronic)", ISSN-L = "0038-0644", bibdate = "Sat Nov 29 17:39:44 MST 2003", bibsource = "http://www.interscience.wiley.com/jpages/0038-0644; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www3.interscience.wiley.com/journalfinder.html", acknowledgement = ack-nhfb, fjournal = "Software---Practice and Experience", journal-URL = "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1097-024X", onlinedate = "19 Feb 2003", } @Article{Koufaty:2003:HTN, author = "David Koufaty and Deborah T. 
Marr", title = "Hyperthreading Technology in the Netburst Microarchitecture", journal = j-IEEE-MICRO, volume = "23", number = "2", pages = "56--65", month = mar # "\slash " # apr, year = "2003", CODEN = "IEMIDZ", DOI = "https://doi.org/10.1109/MM.2003.1196115", ISSN = "0272-1732 (print), 1937-4143 (electronic)", ISSN-L = "0272-1732", bibdate = "Wed Apr 23 18:57:11 MDT 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://dlib.computer.org/mi/books/mi2003/pdf/m2056.pdf; http://www.computer.org/micro/mi2003/m2056abs.htm", acknowledgement = ack-nhfb, fjournal = "IEEE Micro", journal-URL = "http://www.computer.org/csdl/mags/mi/index.html", } @Article{Kranzlmuller:2003:RAP, author = "Dieter Kranzlm{\"u}ller and Peter Kacsuk and Jack Dongarra and Jens Volkert", title = "Recent Advances in Parallel Virtual Machine and Message Passing Interface (Select papers from the {EuroPVMMPI 2002 Conference})", journal = j-IJHPCA, volume = "17", number = "1", pages = "3--5", month = "Spring", year = "2003", CODEN = "IHPCFL", ISSN = "1094-3420 (print), 1741-2846 (electronic)", ISSN-L = "1094-3420", bibdate = "Fri Nov 28 06:52:13 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, journal-URL = "http://hpc.sagepub.com/content/by/year", } @Article{Kreuzinger:2003:RTE, author = "J. Kreuzinger and U. Brinkschulte and M. Pfeffer and S. Uhrig and T. 
Ungerer", title = "Real-time event-handling and scheduling on a multithreaded {Java} microcontroller", journal = j-MICROPROC-MICROSYS, volume = "27", number = "1", pages = "19--31", year = "2003", CODEN = "MIMID5", ISSN = "0141-9331 (print), 1872-9436 (electronic)", ISSN-L = "0141-9331", bibdate = "Tue Feb 18 07:16:21 MST 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; Ingenta database", acknowledgement = ack-nhfb, fjournal = "Microprocessors and Microsystems", } @Article{Kwok:2003:EHC, author = "Yu-Kwong Kwok", title = "On Exploiting Heterogeneity for Cluster Based Parallel Multithreading Using Task Duplication", journal = j-J-SUPERCOMPUTING, volume = "25", number = "1", pages = "63--72", month = may, year = "2003", CODEN = "JOSUED", ISSN = "0920-8542 (print), 1573-0484 (electronic)", ISSN-L = "0920-8542", bibdate = "Tue Dec 16 08:27:09 MST 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.wkap.nl/journalhome.htm/0920-8542", URL = "http://ipsapp009.kluweronline.com/content/getfile/5189/43/4/abstract.htm; http://ipsapp009.kluweronline.com/content/getfile/5189/43/4/fulltext.pdf", acknowledgement = ack-nhfb, fjournal = "The Journal of Supercomputing", journal-URL = "http://link.springer.com/journal/11227", } @Article{Mantel:2003:UAS, author = "Heiko Mantel and Andrei Sabelfeld", title = "A unifying approach to the security of distributed and multi-threaded programs", journal = j-J-COMP-SECUR, volume = "11", number = "4", pages = "615--676", month = "????", year = "2003", CODEN = "JCSIET", DOI = "https://doi.org/10.3233/JCS-2003-11406", ISSN = "0926-227X (print), 1875-8924 (electronic)", ISSN-L = "0926-227X", bibdate = "Tue May 24 06:22:14 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/jcompsecur.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Journal of Computer Security", journal-URL = 
"http://content.iospress.com/journals/journal-of-computer-security", } @Article{Marowka:2003:EOT, author = "Ami Marowka", title = "Extending {OpenMP} for Task Parallelism", journal = j-PARALLEL-PROCESS-LETT, volume = "13", number = "3", pages = "341--??", month = sep, year = "2003", CODEN = "PPLTEE", ISSN = "0129-6264 (print), 1793-642X (electronic)", bibdate = "Sat Nov 6 18:06:31 MST 2004", bibsource = "http://ejournals.wspc.com.sg/ppl/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Parallel Processing Letters", journal-URL = "http://www.worldscientific.com/loi/ppl", } @Article{Mattson:2003:HGO, author = "Timothy G. Mattson", title = "How good is {OpenMP}?", journal = j-SCI-PROG, volume = "11", number = "2", pages = "81--93", year = "2003", CODEN = "SCIPEV", ISSN = "1058-9244 (print), 1875-919X (electronic)", ISSN-L = "1058-9244", bibdate = "Mon Jan 12 06:28:15 MST 2004", bibsource = "http://www.iospress.nl/site/html/10589244.html; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Scientific Programming", journal-URL = "http://iospress.metapress.com/content/1058-9244", } @InProceedings{McAuley:2003:CVC, author = "Derek McAuley and Rolf Neugebauer", title = "A case for virtual channel processors", crossref = "ACM:2003:ATA", pages = "237--242", year = "2003", DOI = "https://doi.org/10.1145/944747.944758", bibdate = "Sat Oct 14 14:03:33 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Modern desktop and server computer systems use multiple processors: general purpose CPU(s), graphic processor (GPU), network processors (NP) on Network Interface Cards (NICs), RAID controllers, and signal processors on sound cards and modems. 
Some of these processors traditionally have been special purpose processors but there is a trend towards replacing some of these with embedded general purpose processors. At the same time main CPUs become more powerful; desktop CPUs start featuring Simultaneous Multi-Threading (SMT); and Symmetric Multi-Processing (SMP) systems are widely used in server systems. However, the structure of operating systems has not really changed to reflect these trends --- different types of processors evolve at different time scales (largely driven by market forces) requiring significant changes to operating systems kernels to reflect the appropriate tradeoffs. In this position paper we propose to re-vitalise the old idea of channel processors by encapsulating operating system I/O subsystems in Virtual Channel Processors (VCPs). VCPs perform I/O operations on behalf of an OS. They provide similar development, performance, and fault isolation as dedicated (embedded) I/O processors do while offering the flexibility to split functionality between the main processor(s) and dedicated processors without affecting the rest of the OS. If part of a VCP is executed on the main processor, we propose to make use of virtual machine technology and SMT/SMP features to isolate its performance from that of the rest of the system and to protect the system from faults within the VCP.", acknowledgement = ack-nhfb, } @Article{McDowell:2003:ISS, author = "Luke K. McDowell and Susan J. Eggers and Steven D. 
Gribble", title = "Improving server software support for simultaneous multithreaded processors", journal = j-SIGPLAN, pages = "37--48", year = "2003", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 22 16:52:42 MST 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Muller:2003:OCB, author = "Matthias S. M{\"u}ller", title = "An {OpenMP} compiler benchmark", journal = j-SCI-PROG, volume = "11", number = "2", pages = "125--131", year = "2003", CODEN = "SCIPEV", ISSN = "1058-9244 (print), 1875-919X (electronic)", ISSN-L = "1058-9244", bibdate = "Mon Jan 12 06:28:15 MST 2004", bibsource = "http://www.iospress.nl/site/html/10589244.html; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Scientific Programming", journal-URL = "http://iospress.metapress.com/content/1058-9244", } @InProceedings{Nakajima:2003:PIS, author = "Kengo Nakajima", title = "Parallel Iterative Solvers of {GeoFEM} with Selective Blocking Preconditioning for Nonlinear Contact Problems on the {Earth Simulator}", crossref = "ACM:2003:SII", pages = "??--??", year = "2003", bibdate = "Wed Nov 26 07:34:20 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10703#1; http://www.sc-conference.org/sc2003/paperpdfs/pap155.pdf", abstract = "An efficient parallel iterative method with selective blocking preconditioning has been developed for symmetric multiprocessor (SMP) cluster architectures with vector processors such as the Earth Simulator. 
This method is based on a three-level hybrid parallel programming model, which includes message passing for inter-SMP node communication, loop directives by OpenMP for intra-SMP node parallelization and vectorization for each processing element (PE). This method provides robust and smooth convergence and excellent vector and parallel performance in 3D geophysical simulations with contact conditions performed on the Earth Simulator. The selective blocking preconditioning is much more efficient than ILU(1) and ILU(2). Performance for the complicated Southwest Japan model with more than 23 M DOF on 10 SMP nodes (80 PEs) of the Earth Simulator was 161.7 GFLOPS, corresponding to 25.3\% of the peak performance for hybrid programming model, and 190.4 GFLOPS (29.8\% of the peak performance) for flat MPI, respectively.", acknowledgement = ack-nhfb, } @Article{Pang:2003:PSR, author = "James C. Pang and Gholamali C. Shoja and Eric G. Manning", title = "Providing soft real-time quality of service guarantees for {Java} threads", journal = j-CCPE, volume = "15", number = "3--5", pages = "521--538", month = mar # "\slash " # apr, year = "2003", CODEN = "CCPEBO", DOI = "https://doi.org/10.1002/cpe.663", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Tue Jan 13 09:28:08 MST 2004", bibsource = "http://www.interscience.wiley.com/jpages/1532-0626; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www3.interscience.wiley.com/journalfinder.html", acknowledgement = ack-nhfb, fjournal = "Concurrency and Computation: Prac\-tice and Experience", journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626", onlinedate = "12 Feb 2003", } @Article{Park:2003:IMP, author = "Il Park and Babak Falsafi and T. N. 
Vijaykumar", title = "Implicitly-multithreaded processors", journal = j-COMP-ARCH-NEWS, volume = "31", number = "2", pages = "39--51", month = may, year = "2003", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Petitpierre:2003:JTC, author = "C. Petitpierre", title = "{Java} Threads Can Be Very Useful Building Blocks", journal = j-LECT-NOTES-COMP-SCI, volume = "2604", pages = "204", year = "2003", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Tue Apr 1 06:09:06 MST 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; Ingenta database", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Pinilla:2003:UJT, author = "Ruben Pinilla and Marisa Gil", title = "{ULT}: a {Java} threads model for platform independent execution", journal = j-OPER-SYS-REV, volume = "37", number = "4", pages = "48--62", month = oct, year = "2003", CODEN = "OSRED8", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:53 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @Article{Pozniansky:2003:EFD, author = "Eli Pozniansky and Assaf Schuster", title = "Efficient on-the-fly data race detection in multithreaded {C++} programs", journal = j-SIGPLAN, pages = "179--190", year = "2003", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 22 16:52:42 MST 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = 
ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Prabhu:2003:UTL, author = "Manohar K. Prabhu and Kunle Olukotun", title = "Using thread-level speculation to simplify manual parallelization", journal = j-SIGPLAN, pages = "1--12", year = "2003", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 22 16:52:42 MST 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Price:2003:CAF, author = "Gregory W. Price and David K. Lowenthal", title = "A comparative analysis of fine-grain threads packages", journal = j-J-PAR-DIST-COMP, volume = "63", number = "11", pages = "1050--1063", month = nov, year = "2003", CODEN = "JPDCER", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Tue Dec 16 16:10:44 MST 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", } @Article{Prvulovic:2003:RUT, author = "Milos Prvulovic and Josep Torrellas", title = "{ReEnact}: using thread-level speculation mechanisms to debug data races in multithreaded codes", journal = j-COMP-ARCH-NEWS, volume = "31", number = "2", pages = "110--121", month = may, year = "2003", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Rajwar:2003:TET, author = "Ravi Rajwar and James Goodman", title = 
"Transactional Execution: Toward Reliable, High-Performance Multithreading", journal = j-IEEE-MICRO, volume = "23", number = "6", pages = "117--125", month = nov # "\slash " # dec, year = "2003", CODEN = "IEMIDZ", DOI = "https://doi.org/10.1109/MM.2003.1261395", ISSN = "0272-1732 (print), 1937-4143 (electronic)", ISSN-L = "0272-1732", bibdate = "Sat Jan 31 07:23:55 MST 2004", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeemicro.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://csdl.computer.org/comp/mags/mi/2003/06/m6117abs.htm; http://csdl.computer.org/dl/mags/mi/2003/06/m6117.htm; http://csdl.computer.org/dl/mags/mi/2003/06/m6117.pdf", acknowledgement = ack-nhfb, fjournal = "IEEE Micro", journal-URL = "http://www.computer.org/csdl/mags/mi/index.html", } @Book{Robbins:2003:USP, author = "Kay A. Robbins and Steven Robbins", title = "{UNIX} Systems programming: communication, concurrency, and threads", publisher = pub-PHPTR, address = pub-PHPTR:adr, edition = "Second", pages = "xvii + 893", year = "2003", ISBN = "0-13-042411-0", ISBN-13 = "978-0-13-042411-2", LCCN = "QA76.76.O63 R6215 2003", bibdate = "Wed Aug 20 21:08:15 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, annote = "See \cite{Robbins:1996:PUP} for first edition.", keywords = "operating systems (computers); UNIX (computer file)", } @Article{Robison:2003:MCN, author = "Arch D. 
Robison", title = "Memory Consistency and {.NET}", journal = j-DDJ, volume = "28", number = "4", pages = "46, 48--50", month = apr, year = "2003", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Thu Jun 12 05:46:22 MDT 2003", bibsource = "http://www.ddj.com/articles/2003/0304/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.ddj.com/documents/s=7827/ddj0304e/", abstract = "Understanding the basics of memory consistency is essential to writing multithreaded code that works on both uniprocessors and multiprocessors.", acknowledgement = ack-nhfb, fjournal = "Dr. Dobb's Journal of Software Tools", } @Article{Solihin:2003:CPU, author = "Yan Solihin and Jaejin Lee and Josep Torrellas", title = "Correlation Prefetching with a User-Level Memory Thread", journal = j-IEEE-TRANS-PAR-DIST-SYS, volume = "14", number = "6", pages = "563--580", month = jun, year = "2003", CODEN = "ITDSEO", DOI = "https://doi.org/10.1109/TPDS.2003.1206504", ISSN = "1045-9219 (print), 1558-2183 (electronic)", ISSN-L = "1045-9219", bibdate = "Wed Dec 24 10:02:07 MST 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://csdl.computer.org/comp/trans/td/2003/06/l0563abs.htm; http://csdl.computer.org/dl/trans/td/2003/06/l0563.pdf", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Parallel and Distributed Systems", journal-URL = "http://www.computer.org/tpds/archives.htm", } @Article{Swanson:2003:ESI, author = "Steven Swanson and Luke K. McDowell and Michael M. Swift and Susan J. Eggers and Henry M. 
Levy", title = "An evaluation of speculative instruction execution on simultaneous multithreaded processors", journal = j-TOCS, volume = "21", number = "3", pages = "314--340", month = aug, year = "2003", CODEN = "ACSYEC", ISSN = "0734-2071 (print), 1557-7333 (electronic)", ISSN-L = "0734-2071", bibdate = "Thu Aug 7 10:13:26 MDT 2003", bibsource = "http://www.acm.org/pubs/contents/journals/tocs/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Computer Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J774", } @Article{Thulasiram:2003:PEM, author = "Ruppa K. Thulasiram and Parimala Thulasiraman", title = "Performance Evaluation of a Multithreaded {Fast Fourier Transform} Algorithm for Derivative Pricing", journal = j-J-SUPERCOMPUTING, volume = "26", number = "1", pages = "43--58", month = aug, year = "2003", CODEN = "JOSUED", ISSN = "0920-8542 (print), 1573-0484 (electronic)", ISSN-L = "0920-8542", bibdate = "Tue Dec 16 08:27:10 MST 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.wkap.nl/journalhome.htm/0920-8542", URL = "http://ipsapp009.kluweronline.com/content/getfile/5189/46/4/abstract.htm; http://ipsapp009.kluweronline.com/content/getfile/5189/46/4/fulltext.pdf", acknowledgement = ack-nhfb, fjournal = "The Journal of Supercomputing", journal-URL = "http://link.springer.com/journal/11227", } @Article{Timmerman:2003:EWC, author = "Martin Timmerman", title = "Examining {Windows CE .NET}", journal = j-DDJ, volume = "28", number = "2", pages = "62, 64", month = feb, year = "2003", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Thu Jun 12 05:46:21 MDT 2003", bibsource = "http://www.ddj.com/articles/2003/0302/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.ddj.com/documents/s=7790/ddj0302h/", abstract = "Martin examines Windows CE .NET's thread handling and advanced interrupt handling capabilities, as well as 
its synchronization mechanisms and network stack performance.", acknowledgement = ack-nhfb, fjournal = "Dr. Dobb's Journal of Software Tools", } @Article{Tremblay:2003:IEP, author = "G. Tremblay and C. J. Morrone and J. N. Amaral and G. R. Gao", title = "Implementation of the {EARTH} programming model on {SMP} clusters: a multi-threaded language and runtime system", journal = j-CCPE, volume = "15", number = "9", pages = "821--844", day = "10", month = aug, year = "2003", CODEN = "CCPEBO", DOI = "https://doi.org/10.1002/cpe.729", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Tue Jan 13 09:28:12 MST 2004", bibsource = "http://www.interscience.wiley.com/jpages/1532-0626; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www3.interscience.wiley.com/journalfinder.html", acknowledgement = ack-nhfb, fjournal = "Concurrency and Computation: Prac\-tice and Experience", journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626", onlinedate = "14 Jul 2003", } @Article{Tseng:2003:DST, author = "Y. Tseng and R. F. DeMara and P. J. 
Wilder", title = "Distributed-sum termination detection supporting multithreaded execution", journal = j-PARALLEL-COMPUTING, volume = "29", number = "7", pages = "953--968", month = jul, year = "2003", CODEN = "PACOEJ", ISSN = "0167-8191 (print), 1872-7336 (electronic)", ISSN-L = "0167-8191", bibdate = "Wed Dec 24 09:07:26 MST 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Parallel Computing", journal-URL = "http://www.sciencedirect.com/science/journal/01678191", } @Article{Ungerer:2003:SPE, author = "Theo Ungerer and Borut Robi{\v{c}} and Jurij {\v{S}}ilc", title = "A survey of processors with explicit multithreading", journal = j-COMP-SURV, volume = "35", number = "1", pages = "29--63", month = mar, year = "2003", CODEN = "CMSVAN", DOI = "https://doi.org/10.1145/641865.641867", ISSN = "0360-0300 (print), 1557-7341 (electronic)", ISSN-L = "0360-0300", bibdate = "Thu Jun 19 10:18:52 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/surveys/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Hardware multithreading is becoming a generally applied technique in the next generation of microprocessors. Several multithreaded processors are announced by industry or already into production in the areas of high-performance microprocessors, media, and network processors. A multithreaded processor is able to pursue two or more threads of control in parallel within the processor pipeline. The contexts of two or more threads of control are often stored in separate on-chip register sets. Unused instruction slots, which arise from latencies during the pipelined execution of single-threaded programs by a contemporary microprocessor, are filled by instructions of other threads within a multithreaded processor. The execution units are multiplexed between the thread contexts that are loaded in the register sets. 
Underutilization of a superscalar processor due to missing instruction-level parallelism can be overcome by simultaneous multithreading, where a processor can issue multiple instructions from multiple threads each cycle. Simultaneous multithreaded processors combine the multithreading technique with a wide-issue superscalar processor to utilize a larger part of the issue bandwidth by issuing instructions from different threads simultaneously. Explicit multithreaded processors are multithreaded processors that apply processes or operating system threads in their hardware thread slots. These processors optimize the throughput of multiprogramming workloads rather than single-thread performance. We distinguish these processors from implicit multithreaded processors that utilize thread-level speculation by speculatively executing compiler- or machine-generated threads of control that are part of a single sequential program. This survey paper explains and classifies the explicit multithreading techniques in research and in commercial microprocessors.", acknowledgement = ack-nhfb, fjournal = "ACM Computing Surveys", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J204", keywords = "Blocked multithreading; interleaved multithreading; simultaneous multithreading", } @Article{vonPraun:2003:SCA, author = "Christoph von Praun and Thomas R. Gross", title = "Static conflict analysis for multi-threaded object-oriented programs", journal = j-SIGPLAN, volume = "38", number = "5", pages = "115--128", month = may, year = "2003", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Oct 11 12:45:00 MDT 2003", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @InProceedings{Watcharawitch:2003:MME, author = "Panit Watcharawitch and Simon W. 
Moore", title = "{MulTEP}: {MulTithreaded Embedded Processors}", crossref = "Anonymous:2003:CCV", pages = "??--??", year = "2003", bibdate = "Fri Jan 09 17:02:42 2004", bibsource = "http://www.coolchips.org/cool6/pdfDocuments/WEB05-Program_COOL6_2003.4.1.pdf; https://www.math.utah.edu/pub/tex/bib/cool-chips.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{White:2003:UTL, author = "Tom White", title = "Using Thread-Local Variables In {Java}", journal = j-DDJ, volume = "28", number = "7", pages = "42, 44--46", month = jul, year = "2003", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Thu Jun 12 05:46:24 MDT 2003", bibsource = "http://www.ddj.com/articles/2003/0307/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.ddj.com/ftp/2003/2003_07/thread.txt; http://www.ddj.com/ftp/2003/2003_07/thread.zip", abstract = "Java's ThreadLocal class provides a powerful, easy-to-use way to write efficient code that is safe for multithreaded access. Additional resources include thread.txt (listings) and thread.zip (source code).", acknowledgement = ack-nhfb, fjournal = "Dr. 
Dobb's Journal of Software Tools", } @Article{Yong:2003:AMC, author = "Xie Yong and Hsu Wen-Jing", title = "Aligned Multithreaded Computations and Their Scheduling with {FAB} Performance Guarantees", journal = j-PARALLEL-PROCESS-LETT, volume = "13", number = "3", pages = "353--??", month = sep, year = "2003", CODEN = "PPLTEE", ISSN = "0129-6264 (print), 1793-642X (electronic)", bibdate = "Thu Jan 06 09:41:03 2005", bibsource = "http://ejournals.wspc.com.sg/ppl/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Parallel Processing Letters", journal-URL = "http://www.worldscientific.com/loi/ppl", } @Article{Bhowmik:2004:GCF, author = "Anasua Bhowmik and Manoj Franklin", title = "A General Compiler Framework for Speculative Multithreaded Processors", journal = j-IEEE-TRANS-PAR-DIST-SYS, volume = "15", number = "8", pages = "713--724", month = aug, year = "2004", CODEN = "ITDSEO", DOI = "https://doi.org/10.1109/TPDS.2004.26", ISSN = "1045-9219 (print), 1558-2183 (electronic)", ISSN-L = "1045-9219", bibdate = "Sat Dec 11 16:24:15 MST 2004", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://csdl.computer.org/dl/trans/td/2004/08/l0713.htm; http://csdl.computer.org/dl/trans/td/2004/08/l0713.pdf", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Parallel and Distributed Systems", journal-URL = "http://www.computer.org/tpds/archives.htm", } @Article{Bouchenak:2004:EIE, author = "S. Bouchenak and D. Hagimont and S. Krakowiak and N. De Palma and F. 
Boyer", title = "Experiences implementing efficient {Java} thread serialization, mobility and persistence", journal = j-SPE, volume = "34", number = "4", pages = "355--393", day = "10", month = apr, year = "2004", CODEN = "SPEXBL", DOI = "https://doi.org/10.1002/spe.569", ISSN = "0038-0644 (print), 1097-024X (electronic)", ISSN-L = "0038-0644", bibdate = "Sat Apr 16 07:26:28 MDT 2005", bibsource = "http://www.interscience.wiley.com/jpages/0038-0644; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www3.interscience.wiley.com/journalfinder.html", acknowledgement = ack-nhfb, fjournal = "Software---Practice and Experience", journal-URL = "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1097-024X", onlinedate = "5 Jan 2004", } @Article{Bucker:2004:TUC, author = "H. M. B{\"u}cker and B. Lang and H. J. Pflug and A. Vehreschild", title = "Threads in an Undergraduate Course: a {Java} Example Illuminating Different Multithreading Approaches", journal = j-LECT-NOTES-COMP-SCI, volume = "3044", pages = "882--891", year = "2004", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Tue Sep 28 15:27:39 MDT 2004", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2000.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; Ingenta database", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Chang:2004:TSP, author = "B. M. Chang and J. D. 
Choi", title = "Thread-Sensitive Points-to Analysis for Multithreaded {Java} Programs", journal = j-LECT-NOTES-COMP-SCI, volume = "3280", pages = "945--954", year = "2004", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Mon Dec 6 06:44:22 MST 2004", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2000.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; Ingenta database", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Chaudhuri:2004:SAN, author = "Mainak Chaudhuri and Mark Heinrich", title = "{SMTp}: {An Architecture} for {Next-generation Scalable Multi-threading}", journal = j-COMP-ARCH-NEWS, volume = "32", number = "2", pages = "124--124", month = mar, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Flanagan:2004:ADA, author = "Cormac Flanagan and Stephen N. Freund", title = "Atomizer: a dynamic atomicity checker for multithreaded programs", journal = j-SIGPLAN, volume = "39", number = "1", pages = "256--267", month = jan, year = "2004", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Apr 12 09:38:12 MDT 2005", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Flanagan:2004:EPA, author = "Cormac Flanagan and Stephen N. 
Freund and Shaz Qadeer", title = "Exploiting purity for atomicity", journal = j-SIGSOFT, volume = "29", number = "4", pages = "221--231", month = jul, year = "2004", CODEN = "SFENDP", DOI = "https://doi.org/10.1145/1013886.1007543", ISSN = "0163-5948 (print), 1943-5843 (electronic)", ISSN-L = "0163-5948", bibdate = "Wed Aug 1 17:14:35 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigsoft2000.bib", abstract = "The notion that certain procedures are atomic is a fundamental correctness property of many multithreaded software systems. A procedure is atomic if for every execution there is an equivalent serial execution in which the actions performed by any thread while executing the atomic procedure are not interleaved with actions of other threads. Several existing tools verify atomicity by using commutativity of actions to show that every execution reduces to a corresponding serial execution. However, experiments with these tools have highlighted a number of interesting procedures that, while intuitively atomic, are not reducible. In this paper, we exploit the notion of pure code blocks to verify the atomicity of such irreducible procedures. If a pure block terminates normally, then its evaluation does not change the program state, and hence these evaluation steps can be removed from the program trace before reduction. We develop a static analysis for atomicity based on this insight, and we illustrate this analysis on a number of interesting examples that could not be verified using earlier tools based purely on reduction. The techniques developed in this paper may also be applicable in other approaches for verifying atomicity, such as model checking and dynamic analysis.", acknowledgement = ack-nhfb, fjournal = "ACM SIGSOFT Software Engineering Notes", journal-URL = "https://dl.acm.org/citation.cfm?id=J728", } @Article{Georges:2004:JPR, author = "A. Georges and M. Christiaens and M. 
Ronsse and K. {De Bosschere}", title = "{JaRec}: a portable record\slash replay environment for multi-threaded {Java} applications", journal = j-SPE, volume = "34", number = "6", pages = "523--547", month = may, year = "2004", CODEN = "SPEXBL", DOI = "https://doi.org/10.1002/spe.579", ISSN = "0038-0644 (print), 1097-024X (electronic)", ISSN-L = "0038-0644", bibdate = "Sat Apr 16 07:26:29 MDT 2005", bibsource = "http://www.interscience.wiley.com/jpages/0038-0644; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www3.interscience.wiley.com/journalfinder.html", acknowledgement = ack-nhfb, fjournal = "Software---Practice and Experience", journal-URL = "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1097-024X", onlinedate = "24 Feb 2004", } @Article{Johnson:2004:MCP, author = "Troy A. Johnson and Rudolf Eigenmann and T. N. Vijaykumar", title = "Min-cut program decomposition for thread-level speculation", journal = j-SIGPLAN, volume = "39", number = "6", pages = "59--70", month = may, year = "2004", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 2 05:49:55 MST 2004", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Johnston:2004:ADP, author = "Wesley M. Johnston and J. R. Paul Hanna and Richard J. 
Millar", title = "Advances in dataflow programming languages", journal = j-COMP-SURV, volume = "36", number = "1", pages = "1--34", month = mar, year = "2004", CODEN = "CMSVAN", DOI = "https://doi.org/10.1145/1013208.1013209", ISSN = "0360-0300 (print), 1557-7341 (electronic)", ISSN-L = "0360-0300", bibdate = "Thu Jun 19 10:19:47 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/surveys/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Many developments have taken place within dataflow programming languages in the past decade. In particular, there has been a great deal of activity and advancement in the field of dataflow visual programming languages. The motivation for this article is to review the content of these recent developments and how they came about. It is supported by an initial review of dataflow programming in the 1970s and 1980s that led to current topics of research. It then discusses how dataflow programming evolved toward a hybrid von Neumann dataflow formulation, and adopted a more coarse-grained approach. Recent trends toward dataflow visual programming languages are then discussed with reference to key graphical dataflow languages and their development environments. Finally, the article details four key open topics in dataflow programming languages.", acknowledgement = ack-nhfb, fjournal = "ACM Computing Surveys", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J204", keywords = "co-ordination languages; component software; data flow visual programming; Dataflow; graphical programming; multithreading; software engineering", } @Article{Kalla:2004:IPC, author = "Ron Kalla and Balaram Sinharoy and Joel M. 
Tendler", title = "{IBM Power5} Chip: a Dual-Core Multithreaded Processor", journal = j-IEEE-MICRO, volume = "24", number = "2", pages = "40--47", month = mar # "\slash " # apr, year = "2004", CODEN = "IEMIDZ", DOI = "https://doi.org/10.1109/MM.2004.1289290", ISSN = "0272-1732 (print), 1937-4143 (electronic)", ISSN-L = "0272-1732", bibdate = "Sat Dec 11 17:59:16 MST 2004", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://csdl.computer.org/comp/mags/mi/2004/02/m2040abs.htm; http://csdl.computer.org/dl/mags/mi/2004/02/m2040.htm; http://csdl.computer.org/dl/mags/mi/2004/02/m2040.pdf", acknowledgement = ack-nhfb, fjournal = "IEEE Micro", journal-URL = "http://www.computer.org/csdl/mags/mi/index.html", } @Article{Kapil:2004:CMP, author = "Sanjiv Kapil and Harlan McGhan and Jesse Lawrendra", title = "A Chip Multithreaded Processor for Network-Facing Workloads", journal = j-IEEE-MICRO, volume = "24", number = "2", pages = "20--30", month = mar # "\slash " # apr, year = "2004", CODEN = "IEMIDZ", DOI = "https://doi.org/10.1109/MM.2004.1289288", ISSN = "0272-1732 (print), 1937-4143 (electronic)", ISSN-L = "0272-1732", bibdate = "Sat Dec 11 17:59:16 MST 2004", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://csdl.computer.org/comp/mags/mi/2004/02/m2020abs.htm; http://csdl.computer.org/dl/mags/mi/2004/02/m2020.htm; http://csdl.computer.org/dl/mags/mi/2004/02/m2020.pdf", acknowledgement = ack-nhfb, fjournal = "IEEE Micro", journal-URL = "http://www.computer.org/csdl/mags/mi/index.html", } @Article{Kee:2004:MMM, author = "Yang-Suk Kee and Jin-Soo Kim and Soonhoi Ha", title = "Memory management for multi-threaded software {DSM} systems", journal = j-PARALLEL-COMPUTING, volume = "30", number = "1", pages = "121--138", month = jan, year = "2004", CODEN = "PACOEJ", ISSN = "0167-8191 (print), 1872-7336 (electronic)", ISSN-L = "0167-8191", bibdate = "Sun Nov 7 05:53:52 MST 2004", bibsource = 
"https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sciencedirect.com/science/journal/01678191", acknowledgement = ack-nhfb, fjournal = "Parallel Computing", journal-URL = "http://www.sciencedirect.com/science/journal/01678191", } @Article{Krashinsky:2004:VTAa, author = "Ronny Krashinsky and Christopher Batten and Mark Hampton and Steve Gerding and Brian Pharris and Jared Casper and Krste Asanovic", title = "The Vector-Thread Architecture", journal = j-COMP-ARCH-NEWS, volume = "32", number = "2", pages = "52--52", month = mar, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Krashinsky:2004:VTAb, author = "Ronny Krashinsky and Christopher Batten and Mark Hampton and Steve Gerding and Brian Pharris and Jared Casper and Krste Asanovic", title = "The Vector-Thread Architecture", journal = j-IEEE-MICRO, volume = "24", number = "6", pages = "84--90", month = nov # "\slash " # dec, year = "2004", CODEN = "IEMIDZ", DOI = "https://doi.org/10.1109/MM.2004.90", ISSN = "0272-1732 (print), 1937-4143 (electronic)", ISSN-L = "0272-1732", bibdate = "Wed Apr 20 08:11:28 MDT 2005", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://csdl.computer.org/dl/mags/mi/2004/06/m6084.htm; http://csdl.computer.org/dl/mags/mi/2004/06/m6084.pdf", acknowledgement = ack-nhfb, fjournal = "IEEE Micro", journal-URL = "http://www.computer.org/csdl/mags/mi/index.html", } @Article{Kumar:2004:AST, author = "Nagendra J. Kumar and Siddhartha Shivshankar and Alexander G. 
Dean", title = "Asynchronous software thread integration for efficient software implementations of embedded communication protocol controllers", journal = j-SIGPLAN, volume = "39", number = "7", pages = "37--46", month = jul, year = "2004", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 2 05:49:55 MST 2004", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Kumar:2004:SIH, author = "Rakesh Kumar and Dean M. Tullsen and Parthasarathy Ranganathan and Norman P. Jouppi and Keith I. Farkas", title = "Single-{ISA} Heterogeneous Multi-Core Architectures for Multithreaded Workload Performance", journal = j-COMP-ARCH-NEWS, volume = "32", number = "2", pages = "64--64", month = mar, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:45 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Lemon:2004:MCR, author = "Oliver Lemon and Alexander Gruenstein", title = "Multithreaded context for robust conversational interfaces: {Context-sensitive} speech recognition and interpretation of corrective fragments", journal = j-TOCHI, volume = "11", number = "3", pages = "241--267", month = sep, year = "2004", CODEN = "ATCIF4", ISSN = "1073-0516 (print), 1557-7325 (electronic)", ISSN-L = "1073-0516", bibdate = "Thu Nov 4 08:26:36 MST 2004", bibsource = "http://www.acm.org/pubs/contents/journals/tochi/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/tochi.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Computer-Human Interaction", journal-URL = 
"http://portal.acm.org/browse_dl.cfm?idx=J756", } @Article{Li:2004:FRT, author = "S. Q. Li and H. Y. Chen and Y. X. Su", title = "A Framework of Reachability Testing for {Java} Multithread Programs", journal = "IEEE International Conference on Systems Man and Cybernetics", volume = "3", pages = "2730--2734", year = "2004", CODEN = "????", ISSN = "1062-922X", bibdate = "Thu Mar 24 17:43:34 MST 2005", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2000.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; Ingenta database", acknowledgement = ack-nhfb, } @Article{Maris:2004:CCP, author = "Justin T. Maris and Aaron W. Keen and Takashi Ishihara and Ronald A. Olsson", title = "A comparison of concurrent programming and cooperative multithreading under load balancing applications", journal = j-CCPE, volume = "16", number = "4", pages = "345--369", day = "10", month = apr, year = "2004", CODEN = "CCPEBO", DOI = "https://doi.org/10.1002/cpe.751", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Sat May 14 11:30:53 MDT 2005", bibsource = "http://www.interscience.wiley.com/jpages/1532-0626; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www3.interscience.wiley.com/journalfinder.html", acknowledgement = ack-nhfb, fjournal = "Concurrency and Computation: Prac\-tice and Experience", journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626", onlinedate = "19 Jan 2004", } @Article{Marowka:2004:OOA, author = "Ami Marowka and Zhenying Liu and Barbara Chapman", title = "{OpenMP}-oriented applications for distributed shared memory architectures", journal = j-CCPE, volume = "16", number = "4", pages = "371--384", day = "10", month = apr, year = "2004", CODEN = "CCPEBO", DOI = "https://doi.org/10.1002/cpe.752", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Sat May 14 11:30:53 MDT 2005", bibsource = "http://www.interscience.wiley.com/jpages/1532-0626; 
https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www3.interscience.wiley.com/journalfinder.html", acknowledgement = ack-nhfb, fjournal = "Concurrency and Computation: Prac\-tice and Experience", journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626", onlinedate = "19 Jan 2004", } @Article{Martin:2004:HPA, author = "Mar{\'\i}a J. Mart{\'\i}n and Marta Parada and Ram{\'o}n Doallo", title = "High Performance Air Pollution Simulation Using {OpenMP}", journal = j-J-SUPERCOMPUTING, volume = "28", number = "3", pages = "311--321", month = jun, year = "2004", CODEN = "JOSUED", ISSN = "0920-8542 (print), 1573-0484 (electronic)", ISSN-L = "0920-8542", bibdate = "Sat Dec 4 12:39:13 MST 2004", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.wkap.nl/journalhome.htm/0920-8542", URL = "http://ipsapp008.kluweronline.com/IPS/content/ext/x/J/5189/I/54/A/5/abstract.htm", acknowledgement = ack-nhfb, fjournal = "The Journal of Supercomputing", journal-URL = "http://link.springer.com/journal/11227", } @Article{Michael:2004:SLF, author = "Maged M. Michael", title = "Scalable lock-free dynamic memory allocation", journal = j-SIGPLAN, volume = "39", number = "6", pages = "35--46", month = may, year = "2004", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/996841.996848", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 2 05:49:55 MST 2004", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Dynamic memory allocators (malloc/free) rely on mutual exclusion locks for protecting the consistency of their shared data structures under multithreading. The use of locking has many disadvantages with respect to performance, availability, robustness, and programming flexibility. A lock-free memory allocator guarantees progress regardless of whether some threads are delayed or even killed and regardless of scheduling policies. 
This paper presents a completely lock-free memory allocator. It uses only widely-available operating system support and hardware atomic instructions. It offers guaranteed availability even under arbitrary thread termination and crash-failure, and it is immune to deadlock regardless of scheduling policies, and hence it can be used even in interrupt handlers and real-time applications without requiring special scheduler support. Also, by leveraging some high-level structures from Hoard, our allocator is highly scalable, limits space blowup to a constant factor, and is capable of avoiding false sharing. In addition, our allocator allows finer concurrency and much lower latency than Hoard. We use PowerPC shared memory multiprocessor systems to compare the performance of our allocator with the default AIX 5.1 libc malloc, and two widely-used multithread allocators, Hoard and Ptmalloc. Our allocator outperforms the other allocators in virtually all cases and often by substantial margins, under various levels of parallelism and allocation patterns. Furthermore, our allocator also offers the lowest contention-free latency among the allocators by significant margins.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Omma:2004:BMA, author = "M. 
Omma", title = "On building multithreaded applications", journal = j-IEEE-DISTRIB-SYST-ONLINE, volume = "5", number = "4", pages = "1--3", month = apr, year = "2004", CODEN = "????", DOI = "https://doi.org/10.1109/MDSO.2004.1301256", ISSN = "1541-4922 (print), 1558-1683 (electronic)", ISSN-L = "1541-4922", bibdate = "Fri Jul 15 17:50:15 MDT 2005", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://ieeexplore.ieee.org/iel5/8968/28913/01301256.pdf?isnumber=28913&prod=JNL&arnumber=1301256&arSt=+1&ared=+3&arAuthor=Omma%2C+M.; http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=28913&arnumber=1301256&count=5&index=3", acknowledgement = ack-nhfb, fjournal = "IEEE Distributed Systems Online", } @Article{Pfeffer:2004:RTG, author = "M. Pfeffer and T. Ungerer and S. Fuhrmann and J. Kreuzinger and U. Brinkschulte", title = "Real-Time Garbage Collection for a Multithreaded {Java} Microcontroller", journal = j-REAL-TIME-SYST, volume = "26", number = "1", pages = "89--106", year = "2004", CODEN = "RESYE9", ISSN = "0922-6443", ISSN-L = "0922-6443", bibdate = "Mon Jan 5 17:25:38 MST 2004", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2000.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; Ingenta database", acknowledgement = ack-nhfb, fjournal = "Real-Time Systems", } @Article{Robatmili:2004:TSI, author = "B. Robatmili and N. Yazdani and S. Sardashti and M. 
Nourani", title = "Thread-Sensitive Instruction Issue for {SMT} Processors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "3", number = "1", pages = "5--5", month = jan, year = "2004", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2004.9", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Simultaneous Multi Threading (SMT) is a processor design method in which concurrent hardware threads share processor resources like functional units and memory. The scheduling complexity and performance of an SMT processor depend on the topology used in the fetch and issue stages. In this paper, we propose a thread sensitive issue policy for a partitioned SMT processor which is based on a thread metric. We propose the number of ready-to-issue instructions of each thread as priority metric. To evaluate our method, we have developed a reconfigurable SMT-simulator on top of the SimpleScalar Toolset. We simulated our modeled processor under several workloads composed of SPEC benchmarks. Experimental results show around 30\% improvement compared to the conventional OLDEST\_FIRST mixed topology issue policy. 
Additionally, the hardware implementation of our architecture with this metric in issue stage is quite simple.", acknowledgement = ack-nhfb, fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Clocks; Delay; Frequency; Intrusion detection; Laboratories; Logic; Processor scheduling; Surface-mount technology; Topology", } @Article{Roth:2004:MTC, author = "Marcus Roth and Gerrit Voss and Dirk Reiners", title = "Multi-threading and clustering for scene graph systems", journal = j-COMPUTERS-AND-GRAPHICS, volume = "28", number = "1", pages = "63--66", month = feb, year = "2004", CODEN = "COGRD2", ISSN = "0097-8493 (print), 1873-7684 (electronic)", ISSN-L = "0097-8493", bibdate = "Tue Jan 27 12:04:28 MST 2004", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sciencedirect.com/science/journal/00978493", acknowledgement = ack-nhfb, fjournal = "Computers and Graphics", journal-URL = "http://www.sciencedirect.com/science/journal/00978493", } @Article{Sanden:2004:CJT, author = "B. 
Sanden", title = "Coping with {Java} Threads: {Java} works for many kinds of concurrent software, but it was not designed for safety-critical real-time applications and does not protect the programmer from the pitfalls associated with multithreading", journal = j-COMPUTER, volume = "37", number = "4", pages = "20--27", year = "2004", CODEN = "CPTRB4", ISSN = "0018-9162 (print), 1558-0814 (electronic)", ISSN-L = "0018-9162", bibdate = "Mon May 17 14:50:36 MDT 2004", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; Ingenta database", acknowledgement = ack-nhfb, fjournal = "Computer", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2", } @Article{Shin:2004:NAD, author = "Chulho Shin and Seong-Won Lee and Jean-Luc Gaudiot", title = "The Need for Adaptive Dynamic Thread Scheduling in Simultaneous Multithreading", journal = j-PARALLEL-PROCESS-LETT, volume = "14", number = "3/4", pages = "327--??", month = sep # "\slash " # dec, year = "2004", CODEN = "PPLTEE", ISSN = "0129-6264 (print), 1793-642X (electronic)", bibdate = "Thu Jul 7 07:41:25 MDT 2005", bibsource = "http://ejournals.wspc.com.sg/ppl/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Parallel Processing Letters", journal-URL = "http://www.worldscientific.com/loi/ppl", } @Article{Thulasiraman:2004:FGL, author = "Parimala Thulasiraman and Ashfaq A. Khokhar and Gerd Heber and Guang R. 
Gao", title = "A fine-grain load-adaptive algorithm of the {$2$D} discrete wavelet transform for multithreaded architectures", journal = j-J-PAR-DIST-COMP, volume = "64", number = "1", pages = "68--78", month = jan, year = "2004", CODEN = "JPDCER", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Sat Dec 4 15:15:08 MST 2004", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sciencedirect.com/science/journal/07437315", acknowledgement = ack-nhfb, fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", } @Article{Tolmach:2004:IFL, author = "Andrew Tolmach and Sergio Antoy and Marius Nita", title = "Implementing functional logic languages using multiple threads and stores", journal = j-SIGPLAN, volume = "39", number = "9", pages = "90--102", month = sep, year = "2004", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 2 05:49:56 MST 2004", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Vrenios:2004:PPC, author = "A. 
Vrenios", title = "{Parallel Programming in C with MPI and OpenMP} [Book Review]", journal = j-IEEE-DISTRIB-SYST-ONLINE, volume = "5", number = "1", pages = "7.1--7.3", month = "????", year = "2004", CODEN = "????", ISSN = "1541-4922 (print), 1558-1683 (electronic)", ISSN-L = "1541-4922", bibdate = "Fri Jul 15 17:50:13 MDT 2005", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://ieeexplore.ieee.org/iel5/8968/28452/01270716.pdf?isnumber=28452&prod=JNL&arnumber=1270716&arSt=+7.1&ared=+7.3&arAuthor=Vrenios%2C+A.; http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=28452&arnumber=1270716&count=8&index=5", acknowledgement = ack-nhfb, fjournal = "IEEE Distributed Systems Online", } @Article{Wang:2004:HTVa, author = "Perry H. Wang and Jamison D. Collins and Hong Wang and Dongkeun Kim and Bill Greene and Kai-Ming Chan and Aamir B. Yunus and Terry Sych and Stephen F. Moore and John P. Shen", title = "Helper threads via virtual multithreading on an experimental {Itanium-2} processor-based platform", journal = j-COMP-ARCH-NEWS, volume = "32", number = "5", pages = "144--155", month = dec, year = "2004", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:24 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Wang:2004:HTVb, author = "Perry H. Wang and Jamison D. Collins and Hong Wang and Dongkeun Kim and Bill Greene and Kai-Ming Chan and Aamir B. Yunus and Terry Sych and Stephen F. Moore and John P. 
Shen", title = "Helper threads via virtual multithreading on an experimental {Itanium-2} processor-based platform", journal = j-SIGPLAN, volume = "39", number = "11", pages = "144--155", month = nov, year = "2004", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Apr 12 09:38:13 MDT 2005", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Wang:2004:HTVc, author = "Perry H. Wang and Jamison D. Collins and Hong Wang and Dongkeun Kim and Bill Greene and Kai-Ming Chan and Aamir B. Yunus and Terry Sych and Stephen F. Moore and John P. Shen", title = "Helper threads via virtual multithreading on an experimental {Itanium-2} processor-based platform", journal = j-OPER-SYS-REV, volume = "38", number = "5", pages = "144--155", month = dec, year = "2004", CODEN = "OSRED8", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:56 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGOPS Operating Systems Review", } @Article{Wang:2004:HTVd, author = "Perry H. Wang and Jamison D. Collins and Hong Wang and Dongkeun Kim and Bill Greene and Kai-Ming Chan and Aamir B. Yunus and Terry Sych and Stephen F. Moore and John P. 
Shen", title = "Helper Threads via Virtual Multithreading", journal = j-IEEE-MICRO, volume = "24", number = "6", pages = "74--82", month = nov # "\slash " # dec, year = "2004", CODEN = "IEMIDZ", DOI = "https://doi.org/10.1109/MM.2004.75", ISSN = "0272-1732 (print), 1937-4143 (electronic)", ISSN-L = "0272-1732", bibdate = "Wed Apr 20 08:11:28 MDT 2005", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://csdl.computer.org/dl/mags/mi/2004/06/m6074.htm; http://csdl.computer.org/dl/mags/mi/2004/06/m6074.pdf", acknowledgement = ack-nhfb, fjournal = "IEEE Micro", journal-URL = "http://www.computer.org/csdl/mags/mi/index.html", } @Article{Zhuang:2004:BRA, author = "Xiaotong Zhuang and Santosh Pande", title = "Balancing register allocation across threads for a multithreaded network processor", journal = j-SIGPLAN, volume = "39", number = "6", pages = "289--300", month = may, year = "2004", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Dec 2 05:49:55 MST 2004", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Abraham:2005:ABP, author = "Erika {\'A}brah{\'a}m and Frank S. 
de Boer and Willem-Paul de Roever and Martin Steffen", title = "An assertion-based proof system for multithreaded {Java}", journal = j-THEOR-COMP-SCI, volume = "331", number = "2--3", pages = "251--290", day = "25", month = feb, year = "2005", CODEN = "TCSCDI", ISSN = "0304-3975 (print), 1879-2294 (electronic)", ISSN-L = "0304-3975", bibdate = "Fri Jul 8 14:05:15 MDT 2005", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sciencedirect.com/science/journal/03043975", acknowledgement = ack-nhfb, fjournal = "Theoretical Computer Science", journal-URL = "http://www.sciencedirect.com/science/journal/03043975", } @Article{Anonymous:2005:ECS, author = "Anonymous", title = "Errata: {{\em Characterization of Simultaneous Multithreading (SMT) Efficiency in POWER5}}", journal = j-IBM-JRD, volume = "49", number = "6", pages = "1003--??", month = nov, year = "2005", CODEN = "IBMJAE", ISSN = "0018-8646 (print), 2151-8556 (electronic)", ISSN-L = "0018-8646", bibdate = "Fri Feb 9 21:39:23 MST 2007", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.research.ibm.com/journal/", note = "See \cite{Mathis:2005:CSM}.", URL = "http://www.research.ibm.com/journal/rd/496/errata.html", acknowledgement = ack-nhfb, fjournal = "IBM Journal of Research and Development", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520", ordernumber = "G322-0245-00", } @Article{Barabash:2005:PIM, author = "Katherine Barabash and Ori Ben-Yitzhak and Irit Goft and Elliot K. 
Kolodner and Victor Leikehman and Yoav Ossia and Avi Owshanko and Erez Petrank", title = "A parallel, incremental, mostly concurrent garbage collector for servers", journal = j-TOPLAS, volume = "27", number = "6", pages = "1097--1146", month = nov, year = "2005", CODEN = "ATPSDT", DOI = "https://doi.org/10.1145/1108970.1108972", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Wed Jan 11 05:23:15 MST 2006", bibsource = "http://www.acm.org/pubs/contents/journals/toplas/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Multithreaded applications with multigigabyte heaps running on modern servers provide new challenges for garbage collection (GC). The challenges for ``server-oriented'' GC include: ensuring short pause times on a multigigabyte heap while minimizing throughput penalty, good scaling on multiprocessor hardware, and keeping the number of expensive multicycle fence instructions required by weak ordering to a minimum. We designed and implemented a collector facing these demands building on the mostly concurrent garbage collector proposed by Boehm et al. [1991]. Our collector incorporates new ideas into the original collector. We make it parallel and incremental; we employ concurrent low-priority background GC threads to take advantage of processor idle time; we propose novel algorithmic improvements to the basic mostly concurrent algorithm improving its efficiency and shortening its pause times; and finally, we use advanced techniques, such as a low-overhead work packet mechanism to enable full parallelism among the incremental and concurrent collecting threads and ensure load balancing. We compared the new collector to the mature, well-optimized, parallel, stop-the-world mark-sweep collector already in the IBM JVM. 
When allowed to run aggressively, using 72\% of the CPU utilization during a short concurrent phase, our collector prototype reduces the maximum pause time from 161 ms to 46 ms while only losing 11.5\% throughput when running the SPECjbb2000 benchmark on a 600-MB heap on an 8-way PowerPC 1.1-GHz processors. When the collector is limited to a nonintrusive operation using only 29\% of the CPU utilization, the maximum pause time obtained is 79 ms and the loss in throughput is 15.4\%.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Programming Languages and Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783", } @Article{Basharahil:2005:DSA, author = "Ramzi Basharahil and Brian Wims and Cheng-Zhong Xu and Song Fu", title = "Distributed Shared Arrays: An Integration of Message Passing and Multithreading on {SMP} Clusters", journal = j-J-SUPERCOMPUTING, volume = "31", number = "2", pages = "161--184", month = feb, year = "2005", CODEN = "JOSUED", DOI = "https://doi.org/10.1007/s11227-005-0041-5", ISSN = "0920-8542 (print), 1573-0484 (electronic)", ISSN-L = "0920-8542", bibdate = "Wed Jul 6 10:36:19 MDT 2005", bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=31&issue=2; https://www.math.utah.edu/pub/tex/bib/jsuper.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=31&issue=2&spage=161", acknowledgement = ack-nhfb, fjournal = "The Journal of Supercomputing", journal-URL = "http://link.springer.com/journal/11227", } @Article{Boehm:2005:TCI, author = "Hans-J. 
Boehm", title = "Threads cannot be implemented as a library", journal = j-SIGPLAN, volume = "40", number = "6", pages = "261--268", month = jun, year = "2005", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1065010.1065042", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jun 21 17:04:05 MDT 2005", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "In many environments, multi-threaded code is written in a language that was originally designed without thread support (e.g. C), to which a library of threading primitives was subsequently added. There appears to be a general understanding that this is not the right approach. We provide specific arguments that a pure library approach, in which the compiler is designed independently of threading issues, cannot guarantee correctness of the resulting code. We first review why the approach almost works, and then examine some of the surprising behavior it may entail. We further illustrate that there are very simple cases in which a pure library-based approach seems incapable of expressing an efficient parallel algorithm. Our discussion takes place in the context of C with Pthreads, since it is commonly used, reasonably well specified, and does not attempt to ensure type-safety, which would entail even stronger constraints. The issues we raise are not specific to that context.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "This is an important paper: it shows that current languages cannot be reliable for threaded programming without language changes that prevent compiler optimizations from foiling synchronization methods and memory barriers. 
The article's author and others are collaborating on a proposal for changes to the C++ language to remedy this, but that still leaves threads unreliable in C code, even with POSIX threads.", } @Article{Boroday:2005:DAJ, author = "S. Boroday and A. Petrenko and J. Singh and H. Hallal", title = "Dynamic analysis of {Java} applications for multithreaded antipatterns", journal = j-SIGSOFT, volume = "30", number = "4", pages = "1--7", month = jul, year = "2005", CODEN = "SFENDP", DOI = "https://doi.org/10.1145/1082983.1083247", ISSN = "0163-5948 (print), 1943-5843 (electronic)", ISSN-L = "0163-5948", bibdate = "Wed Aug 1 17:14:51 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2000.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigsoft2000.bib", abstract = "Formal verification is not always applicable to large industrial software systems due to scalability issues and difficulties in formal model and requirements specification. The scalability and model derivation problems could be alleviated by runtime trace analysis, which combines both testing and formal verification. We implement and compare an ad-hoc custom approach and a formal approach to detect common bug patterns in multithreaded Java software. We use the tracing platform of the Eclipse IDE and state-of-the-art model checker Spin.", acknowledgement = ack-nhfb, fjournal = "ACM SIGSOFT Software Engineering Notes", journal-URL = "https://dl.acm.org/citation.cfm?id=J728", } @Article{Brinkschulte:2005:ICA, author = "U. Brinkschulte and M. 
Pacher", title = "Implementing Control Algorithms Within a Multithreaded {Java} Microcontroller", journal = j-LECT-NOTES-COMP-SCI, volume = "3432", pages = "33--49", year = "2005", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Tue Apr 26 10:50:23 MDT 2005", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2000.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; Ingenta database", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Constantinou:2005:PIS, author = "Theofanis Constantinou and Yiannakis Sazeides and Pierre Michaud and Damien Fetis and Andre Seznec", title = "Performance implications of single thread migration on a chip multi-core", journal = j-COMP-ARCH-NEWS, volume = "33", number = "4", pages = "80--91", month = nov, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:08 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Flanagan:2005:MVM, author = "Cormac Flanagan and Stephen N. Freund and Shaz Qadeer and Sanjit A. Seshia", title = "Modular verification of multithreaded programs", journal = j-THEOR-COMP-SCI, volume = "338", number = "1--3", pages = "153--183", day = "10", month = jun, year = "2005", CODEN = "TCSCDI", ISSN = "0304-3975 (print), 1879-2294 (electronic)", ISSN-L = "0304-3975", bibdate = "Fri Jul 8 14:05:16 MDT 2005", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sciencedirect.com/science/journal/03043975", acknowledgement = ack-nhfb, fjournal = "Theoretical Computer Science", journal-URL = "http://www.sciencedirect.com/science/journal/03043975", } @TechReport{Garcia:2005:HJA, author = "P. Garcia and H. F. 
Korth", title = "Hash-join algorithms on modern multithreaded computer architectures", type = "Report", number = "LUCSE-05-001", institution = "Lehigh University", address = "Bethlehem, PA, USA", month = "????", year = "2005", bibdate = "Mon Dec 10 07:05:38 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Giampapa:2005:BGA, author = "M. E. Giampapa and R. Bellofatto and M. A. Blumrich and D. Chen and M. B. Dombrowa and A. Gara and R. A. Haring and P. Heidelberger and D. Hoenicke and G. V. Kopcsay and B. J. Nathanson and B. D. Steinmacher-Burow and M. Ohmacht and V. Salapura and P. Vranas", title = "{Blue Gene/L} advanced diagnostics environment", journal = j-IBM-JRD, volume = "49", number = "2/3", pages = "319--331", month = "????", year = "2005", CODEN = "IBMJAE", ISSN = "0018-8646 (print), 2151-8556 (electronic)", ISSN-L = "0018-8646", bibdate = "Wed Jun 1 08:14:41 MDT 2005", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.research.ibm.com/journal/", URL = "http://www.research.ibm.com/journal/rd/492/giampapa.pdf", abstract = "This paper describes the Blue Gene/L advanced diagnostics environment (ADE) used throughout all aspects of the Blue Gene/L project, including design, logic verification, bringup, diagnostics, and manufacturing test. The Blue Gene/L ADE consists of a lightweight multithreaded coherence-managed kernel, runtime libraries, device drivers, system programming interfaces, compilers, and host-based development tools. It provides complete and flexible access to all features of the Blue Gene/L hardware. Prior to the existence of hardware, ADE was used on Very high-speed integrated circuit Hardware Description Language (VHDL) models, not only for logic verification, but also for performance measurements, code-path analysis, and evaluation of architectural tradeoffs. 
During early hardware bring-up, the ability to run in a cycle-reproducible manner on both hardware and VHDL proved invaluable in fault isolation and analysis. However, ADE is also capable of supporting high-performance applications and parallel test cases, thereby permitting us to stress the hardware to the limits of its capabilities. This paper also provides insights into system-level and device-level programming of Blue Gene/L to assist developers of high-performance applications to more fully exploit the performance of the machine.", acknowledgement = ack-nhfb, fjournal = "IBM Journal of Research and Development", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520", ordernumber = "G322-0240", } @Article{Gil:2005:TCS, author = "Marisa Gil and Ruben Pinilla", title = "Thread coloring: a scheduler proposal from user to hardware threads", journal = j-OPER-SYS-REV, volume = "39", number = "2", pages = "54--70", month = apr, year = "2005", CODEN = "OSRED8", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @Article{Gustafsson:2005:TP, author = "Andreas Gustafsson", title = "Threads without the pain", journal = j-QUEUE, volume = "3", number = "9", pages = "42--47", month = nov, year = "2005", CODEN = "AQCUAE", ISSN = "1542-7730 (print), 1542-7749 (electronic)", ISSN-L = "1542-7730", bibdate = "Sat Dec 17 07:37:28 MST 2005", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM Queue: Tomorrow's Computing Today", } @Article{Keller:2005:TBV, author = "J{\"o}rg Keller and Andreas Gr{\"a}vinghoff", title = "Thread-Based Virtual Duplex Systems in Embedded Environments", journal = j-IEEE-MICRO, volume = "25", number = "2", pages = "60--69", month = mar # "\slash " # apr, year = 
"2005", CODEN = "IEMIDZ", DOI = "https://doi.org/10.1109/MM.2005.39", ISSN = "0272-1732 (print), 1937-4143 (electronic)", ISSN-L = "0272-1732", bibdate = "Wed Apr 20 08:11:29 MDT 2005", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://csdl.computer.org/comp/mags/mi/2005/02/m2060abs.htm; http://csdl.computer.org/dl/mags/mi/2005/02/m2060.pdf", acknowledgement = ack-nhfb, fjournal = "IEEE Micro", journal-URL = "http://www.computer.org/csdl/mags/mi/index.html", } @Article{Kongetira:2005:NWM, author = "Poonacha Kongetira and Kathirgamar Aingaran and Kunle Olukotun", title = "{Niagara}: a 32-Way Multithreaded {Sparc} Processor", journal = j-IEEE-MICRO, volume = "25", number = "2", pages = "21--29", month = mar # "\slash " # apr, year = "2005", CODEN = "IEMIDZ", DOI = "https://doi.org/10.1109/MM.2005.35", ISSN = "0272-1732 (print), 1937-4143 (electronic)", ISSN-L = "0272-1732", bibdate = "Wed Apr 20 08:11:29 MDT 2005", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://csdl.computer.org/comp/mags/mi/2005/02/m2021abs.htm; http://csdl.computer.org/dl/mags/mi/2005/02/m2021.pdf", acknowledgement = ack-nhfb, fjournal = "IEEE Micro", journal-URL = "http://www.computer.org/csdl/mags/mi/index.html", } @Article{Li:2005:OSA, author = "Xiaoye S. Li", title = "An overview of {SuperLU}: {Algorithms}, implementation, and user interface", journal = j-TOMS, volume = "31", number = "3", pages = "302--325", month = sep, year = "2005", CODEN = "ACMSCU", DOI = "https://doi.org/10.1145/1089014.1089017", ISSN = "0098-3500 (print), 1557-7295 (electronic)", ISSN-L = "0098-3500", bibdate = "Wed Oct 5 07:43:35 MDT 2005", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "We give an overview of the algorithms, design philosophy, and implementation techniques in the software SuperLU, for solving sparse unsymmetric linear systems. 
In particular, we highlight the differences between the sequential SuperLU (including its multithreaded extension) and parallel SuperLU\_DIST. These include the numerical pivoting strategy, the ordering strategy for preserving sparsity, the ordering in which the updating tasks are performed, the numerical kernel, and the parallelization strategy. Because of the scalability concern, the parallel code is drastically different from the sequential one. We describe the user interfaces of the libraries, and illustrate how to use the libraries most efficiently depending on some matrix characteristics. Finally, we give some examples of how the solver has been used in large-scale scientific applications, and the performance.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Mathematical Software (TOMS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J782", } @Article{Loepere:2005:STM, author = "Keith Loepere", title = "Stackable thread mechanisms", journal = j-OPER-SYS-REV, volume = "39", number = "4", pages = "4--17", month = oct, year = "2005", CODEN = "OSRED8", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:53 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @Article{Mathis:2005:CSM, author = "H. M. Mathis and A. E. Mericas and J. D. McCalpin and R. J. Eickemeyer and S. R. 
Kunkel", title = "Characterization of simultaneous multithreading ({SMT}) efficiency in {POWER5}", journal = j-IBM-JRD, volume = "49", number = "4/5", pages = "555--564", month = "????", year = "2005", CODEN = "IBMJAE", ISSN = "0018-8646 (print), 2151-8556 (electronic)", ISSN-L = "0018-8646", bibdate = "Wed Oct 5 07:12:31 MDT 2005", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.research.ibm.com/journal/", URL = "http://www.research.ibm.com/journal/rd/494/mathis.html", acknowledgement = ack-nhfb, fjournal = "IBM Journal of Research and Development", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520", } @Article{McNairy:2005:MDC, author = "Cameron McNairy and Rohit Bhatia", title = "{Montecito}: a Dual-Core, Dual-Thread {Itanium} Processor", journal = j-IEEE-MICRO, volume = "25", number = "2", pages = "10--20", month = mar # "\slash " # apr, year = "2005", CODEN = "IEMIDZ", DOI = "https://doi.org/10.1109/MM.2005.34", ISSN = "0272-1732 (print), 1937-4143 (electronic)", ISSN-L = "0272-1732", bibdate = "Wed Apr 20 08:11:29 MDT 2005", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://csdl.computer.org/comp/mags/mi/2005/02/m2010abs.htm; http://csdl.computer.org/dl/mags/mi/2005/02/m2010.pdf", acknowledgement = ack-nhfb, fjournal = "IEEE Micro", journal-URL = "http://www.computer.org/csdl/mags/mi/index.html", } @Article{Mudigonda:2005:MMA, author = "Jayaram Mudigonda and Harrick M. 
Vin and Raj Yavatkar", title = "Managing memory access latency in packet processing", journal = j-SIGMETRICS, volume = "33", number = "1", pages = "396--397", month = jun, year = "2005", CODEN = "????", DOI = "https://doi.org/10.1145/1064212.1064272", ISSN = "0163-5999 (print), 1557-9484 (electronic)", ISSN-L = "0163-5999", bibdate = "Fri Jun 27 09:21:27 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "In this study, we refute the popular belief [1,2] that packet processing does not benefit from data-caching. We show that a small data-cache of 8KB can bring down the packet processing time by as much as 50-90\%, while reducing the off-chip memory bandwidth usage by about 60-95\%. We also show that, unlike general-purpose computing, packet processing, due to its memory-intensive nature, cannot rely exclusively on data-caching to eliminate the memory bottleneck completely.", acknowledgement = ack-nhfb, fjournal = "ACM SIGMETRICS Performance Evaluation Review", journal-URL = "http://portal.acm.org/toc.cfm?id=J618", keywords = "data-caches; multithreading; network processors", } @Article{Petric:2005:EEP, author = "Vlad Petric and Amir Roth", title = "Energy-Effectiveness of Pre-Execution and Energy-Aware {P}-Thread Selection", journal = j-COMP-ARCH-NEWS, volume = "33", number = "2", pages = "322--333", month = may, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:51 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Ruan:2005:EIS, author = "Yaoping Ruan and Vivek S. Pai and Erich Nahum and John M. 
Tracey", title = "Evaluating the impact of simultaneous multithreading on network servers using real hardware", journal = j-SIGMETRICS, volume = "33", number = "1", pages = "315--326", month = jun, year = "2005", CODEN = "????", DOI = "https://doi.org/10.1145/1071690.1064254", ISSN = "0163-5999 (print), 1557-9484 (electronic)", ISSN-L = "0163-5999", bibdate = "Fri Jun 27 09:21:27 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "This paper examines the performance of simultaneous multithreading (SMT) for network servers using actual hardware, multiple network server applications, and several workloads. Using three versions of the Intel Xeon processor with Hyper-Threading, we perform macroscopic analysis as well as microarchitectural measurements to understand the origins of the performance bottlenecks for SMT processors in these environments. The results of our evaluation suggest that the current SMT support in the Xeon is application and workload sensitive, and may not yield significant benefits for network servers. In general, we find that enabling SMT on real hardware usually produces only slight performance gains, and can sometimes lead to performance loss. In the uniprocessor case, previous studies appear to have neglected the OS overhead in switching from a uniprocessor kernel to an SMT-enabled kernel. The performance loss associated with such support is comparable to the gains provided by SMT. In the 2-way multiprocessor case, the higher number of memory references from SMT often causes the memory system to become the bottleneck, offsetting any processor utilization gains. This effect is compounded by the growing gap between processor speeds and memory latency. 
In trying to understand the large gains shown by simulation studies, we find that while the general trends for microarchitectural behavior agree with real hardware, differences in sizing assumptions and performance models yield much more optimistic benefits for SMT than we observe.", acknowledgement = ack-nhfb, fjournal = "ACM SIGMETRICS Performance Evaluation Review", journal-URL = "http://portal.acm.org/toc.cfm?id=J618", keywords = "network server; simultaneous multithreading(SMT)", } @Article{Rufai:2005:MPO, author = "Raimi Rufai and Muslim Bozyigit and Jaralla Alghamdi and Moataz Ahmed", title = "Multithreaded Parallelism with {OpenMP}", journal = j-PARALLEL-PROCESS-LETT, volume = "15", number = "4", pages = "367--378", month = dec, year = "2005", CODEN = "PPLTEE", DOI = "https://doi.org/10.1142/S0129626405002283", ISSN = "0129-6264 (print), 1793-642X (electronic)", bibdate = "Thu Sep 2 09:08:11 MDT 2010", bibsource = "http://ejournals.wspc.com.sg/ppl/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/parallelprocesslett.bib; https://www.math.utah.edu/pub/tex/bib/pvm.bib", acknowledgement = ack-nhfb, fjournal = "Parallel Processing Letters", journal-URL = "http://www.worldscientific.com/loi/ppl", } @Article{Sendag:2005:IIS, author = "Resit Sendag and Ying Chen and David J. 
Lilja", title = "The Impact of Incorrectly Speculated Memory Operations in a Multithreaded Architecture", journal = j-IEEE-TRANS-PAR-DIST-SYS, volume = "16", number = "3", pages = "271--285", month = mar, year = "2005", CODEN = "ITDSEO", DOI = "https://doi.org/10.1109/TPDS.2005.36", ISSN = "1045-9219 (print), 1558-2183 (electronic)", ISSN-L = "1045-9219", bibdate = "Thu Nov 10 08:30:29 MST 2005", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Parallel and Distributed Systems", journal-URL = "http://www.computer.org/tpds/archives.htm", } @Article{Shinjo:2005:AEP, author = "Y. Shinjo and C. Pu", title = "Achieving efficiency and portability in systems software: a case study on {POSIX}-compliant multithreaded programs", journal = j-IEEE-TRANS-SOFTW-ENG, volume = "31", number = "9", pages = "785--800", month = sep, year = "2005", CODEN = "IESEDJ", DOI = "https://doi.org/10.1109/TSE.2005.98", ISSN = "0098-5589 (print), 1939-3520 (electronic)", ISSN-L = "0098-5589", bibdate = "Thu Feb 1 11:00:42 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranssoftweng2000.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/unix.bib", URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=1514446", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Software Engineering", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=32", } @Article{Stark:2005:FSV, author = "Robert F. 
St{\"a}rk", title = "Formal specification and verification of the {C\#} thread model", journal = j-THEOR-COMP-SCI, volume = "343", number = "3", pages = "482--508", day = "17", month = oct, year = "2005", CODEN = "TCSCDI", ISSN = "0304-3975 (print), 1879-2294 (electronic)", ISSN-L = "0304-3975", bibdate = "Tue Mar 29 06:48:50 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sciencedirect.com/science/journal/03043975", abstract = "We present a high-level Abstract State Machine (ASM) model of C\# threads and the .NET memory model. We focus on purely managed, fully portable threading features of C\#. The sequential model interleaves the computation steps of the currently running threads and is suitable for uniprocessors. The parallel model addresses problems of true concurrency on multi-processor systems. The models provide a sound basis for the development of multi-threaded applications in C\#. The thread and memory models complete the abstract operational semantics of C\# in [B{\"o}rger et al. Theoret. Comput. Sci., to appear]. 
The main invariants of the thread model concerning locks, monitors and mutual exclusion are formally verified in the AsmTP system, an interactive proof assistant based on ASM logic.", acknowledgement = ack-nhfb, fjournal = "Theoretical Computer Science", journal-URL = "http://www.sciencedirect.com/science/journal/03043975", } @Article{Steinke:2005:NPF, author = "Robert Steinke and Micah Clark and Elihu McMahon", title = "A new pattern for flexible worker threads with in-place consumption message queues", journal = j-OPER-SYS-REV, volume = "39", number = "2", pages = "71--73", month = apr, year = "2005", CODEN = "OSRED8", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:43 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @Article{Sundell:2005:FLF, author = "H{\aa}kan Sundell and Philippas Tsigas", title = "Fast and lock-free concurrent priority queues for multi-thread systems", journal = j-J-PAR-DIST-COMP, volume = "65", number = "5", pages = "609--627", month = may, year = "2005", CODEN = "JPDCER", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Fri Jul 11 20:32:33 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sciencedirect.com/science/journal/07437315", acknowledgement = ack-nhfb, fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", } @Article{Tian:2005:PCT, author = "Xinmin Tian and Milind Girkar and Aart Bik and Hideki Saito", title = "Practical Compiler Techniques on Efficient Multithreaded Code Generation for {OpenMP} Programs", journal = j-COMP-J, volume = "48", number = "5", pages = "588--601", month = sep, year = "2005", CODEN = "CMPJA6", DOI = "https://doi.org/10.1093/comjnl/bxh109", ISSN = "0010-4620 (print), 1460-2067 (electronic)", ISSN-L = 
"0010-4620", bibdate = "Tue Nov 8 05:58:50 MST 2005", bibsource = "http://comjnl.oxfordjournals.org/content/vol48/issue5/index.dtl; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://comjnl.oxfordjournals.org/cgi/content/abstract/48/5/588; http://comjnl.oxfordjournals.org/cgi/reprint/48/5/588", acknowledgement = ack-nhfb, fjournal = "The Computer Journal", journal-URL = "http://comjnl.oxfordjournals.org/", } @Article{Vachharajani:2005:CMP, author = "Neil Vachharajani and Matthew Iyer and Chinmay Ashok and Manish Vachharajani and David I. August and Daniel Connors", title = "Chip multi-processor scalability for single-threaded applications", journal = j-COMP-ARCH-NEWS, volume = "33", number = "4", pages = "44--53", month = nov, year = "2005", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:08 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Abadi:2006:TSL, author = "Martin Abadi and Cormac Flanagan and Stephen N. Freund", title = "Types for safe locking: {Static} race detection for {Java}", journal = j-TOPLAS, volume = "28", number = "2", pages = "207--255", month = mar, year = "2006", CODEN = "ATPSDT", DOI = "https://doi.org/10.1145/1119479.1119480", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Fri Mar 10 18:46:58 MST 2006", bibsource = "http://www.acm.org/pubs/contents/journals/toplas/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "This article presents a static race-detection analysis for multithreaded shared-memory programs, focusing on the Java programming language. The analysis is based on a type system that captures many common synchronization patterns. 
It supports classes with internal synchronization, classes that require client-side synchronization, and thread-local classes. In order to demonstrate the effectiveness of the type system, we have implemented it in a checker and applied it to over 40,000 lines of hand-annotated Java code. We found a number of race conditions in the standard Java libraries and other test programs. The checker required fewer than 20 additional type annotations per 1,000 lines of code. This article also describes two improvements that facilitate checking much larger programs: an algorithm for annotation inference and a user interface that clarifies warnings generated by the checker. These extensions have enabled us to use the checker for identifying race conditions in large-scale software systems with up to 500,000 lines of code.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Programming Languages and Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783", } @TechReport{Aciicmez:2006:PSB, author = "Onur Acii{\c{c}}mez and {\c{C}}etin Kaya Ko{\c{c}} and Jean-Pierre Seifert", title = "On the Power of Simple Branch Prediction Analysis", type = "Technical report", institution = "School of EECS, Oregon State University", address = "Corvallis, OR 97331, USA", month = oct, year = "2006", bibdate = "Mon Nov 20 14:57:23 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2000.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://eprint.iacr.org/2006/351; http://eprint.iacr.org/2006/351.pdf", abstract = "Very recently, a new software side-channel attack, called Branch Prediction Analysis (BPA) attack, has been discovered and also demonstrated to be practically feasible on popular commodity PC platforms. 
While the above recent attack still had the flavor of a classical timing attack against RSA, where one uses many execution-time measurements under the same key in order to statistically amplify some small but key-dependent timing differences, we dramatically improve upon the former result. We prove that a carefully written spy-process running simultaneously with an RSA-process, is able to collect during one \emph{single} RSA signing execution almost all of the secret key bits. We call such an attack, analyzing the CPU's Branch Predictor states through spying on a single quasi-parallel computation process, a \emph{Simple Branch Prediction Analysis (SBPA)} attack --- sharply differentiating it from those one relying on statistical methods and requiring many computation measurements under the same key. The successful extraction of almost all secret key bits by our SBPA attack against an openSSL RSA implementation proves that the often recommended blinding or so called randomization techniques to protect RSA against side-channel attacks are, in the context of SBPA attacks, totally useless. Additional to that very crucial security implication, targeted at such implementations which are assumed to be at least statistically secure, our successful SBPA attack also bears another equally critical security implication. Namely, in the context of simple side-channel attacks, it is widely believed that equally balancing the operations after branches is a secure countermeasure against such simple attacks. Unfortunately, this is not true, as even such ``balanced branch'' implementations can be completely broken by our SBPA attacks. Moreover, despite sophisticated hardware-assisted partitioning methods such as memory protection, sandboxing or even virtualization, SBPA attacks empower an unprivileged process to successfully attack other processes running in parallel on the same processor. 
Thus, we conclude that SBPA attacks are much more dangerous than previously anticipated, as they obviously do not belong to the same category as pure timing attacks.", acknowledgement = ack-nhfb, keywords = "implementation / Branch Prediction; Modular Exponentiation; RSA; Side Channel Analysis; Simultaneous Multithreading; Trusted Computing", } @Article{Adl-Tabatabai:2006:CRS, author = "Ali-Reza Adl-Tabatabai and Brian T. Lewis and Vijay Menon and Brian R. Murphy and Bratin Saha and Tatiana Shpeisman", title = "Compiler and runtime support for efficient software transactional memory", journal = j-SIGPLAN, volume = "41", number = "6", pages = "26--37", month = jun, year = "2006", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1133981.1133985", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 10:42:48 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Programmers have traditionally used locks to synchronize concurrent access to shared data. Lock-based synchronization, however, has well-known pitfalls: using locks for fine-grain synchronization and composing code that already uses locks are both difficult and prone to deadlock. Transactional memory provides an alternate concurrency control mechanism that avoids these pitfalls and significantly eases concurrent programming. Transactional memory language constructs have recently been proposed as extensions to existing languages or included in new concurrent language specifications, opening the door for new compiler optimizations that target the overheads of transactional memory. This paper presents compiler and runtime optimizations for transactional memory language constructs. We present a high-performance software transactional memory system (STM) integrated into a managed runtime environment. 
Our system efficiently implements nested transactions that support both composition of transactions and partial roll back. Our JIT compiler is the first to optimize the overheads of STM, and we show novel techniques for enabling JIT optimizations on STM operations. We measure the performance of our optimizations on a 16-way SMP running multi-threaded transactional workloads. Our results show that these techniques enable transactional memory's performance to compete with that of well-tuned synchronization.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "code generation; compiler optimizations; locking; synchronization; transactional memory; virtual machines", } @Article{Agerwala:2006:SRC, author = "T. Agerwala and M. Gupta", title = "Systems research challenges: a scale-out perspective", journal = j-IBM-JRD, volume = "50", number = "2/3", pages = "173--??", month = mar # " \slash " # may, year = "2006", CODEN = "IBMJAE", ISSN = "0018-8646 (print), 2151-8556 (electronic)", ISSN-L = "0018-8646", bibdate = "Fri Feb 9 20:16:31 MST 2007", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.research.ibm.com/journal/", URL = "http://www.research.ibm.com/journal/rd/502/agerwala.html", abstract = "A scale-out system is a collection of interconnected, modular, low-cost computers that work as a single entity to cooperatively provide applications, systems resources, and data to users. The dominant programming model for such systems consists of message passing at the systems level and multithreading at the element level. Scale-out computers have traditionally been developed and deployed to provide levels of performance (throughput and parallel processing) beyond what was achievable by large shared-memory computers that utilized the fastest processors and the most expensive memory systems. 
Today, exploiting scale-out at all levels in systems is becoming imperative in order to overcome a fundamental discontinuity in the development of microprocessor technology caused by power dissipation. The pervasive use of greater levels of scale-out, on the other hand, creates its own challenges in architecture, programming, systems management, and reliability. This position paper identifies some of the important research problems that must be addressed in order to deal with the technology disruption and fully realize the opportunity offered by scale-out. Our examples are based on parallelism, but the challenges we identify apply to scale-out more generally.", acknowledgement = ack-nhfb, fjournal = "IBM Journal of Research and Development", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520", ordernumber = "G322-0247-00", } @Article{Bacon:2006:BFL, author = "D. F. Bacon and X. Shen", title = "Braids and fibers: Language constructs with architectural support for adaptive responses to memory latencies", journal = j-IBM-JRD, volume = "50", number = "2/3", pages = "209--??", month = mar # " \slash " # may, year = "2006", CODEN = "IBMJAE", ISSN = "0018-8646 (print), 2151-8556 (electronic)", ISSN-L = "0018-8646", bibdate = "Fri Feb 9 20:16:31 MST 2007", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.research.ibm.com/journal/", URL = "http://www.research.ibm.com/journal/rd/502/bacon.html", abstract = "As processor speeds continue to increase at a much higher rate than memory speeds, memory latencies may soon approach a thousand processor cycles. As a result, the flat memory model that was made practical by deeply pipelined superscalar processors with multilevel caches will no longer be tenable. The most common approach to this problem is multithreading; however, multithreading requires either abundant independent applications or well-parallelized monolithic applications, and neither is easy to come by. 
We present high-level programming constructs called braids and fibers. The programming constructs facilitate the creation of programs that are partially ordered, in which the partial orders can be used to support adaptive responses to memory access latencies. Braiding is simpler than parallelizing, while yielding many of the same benefits. We show how the programming constructs can be effectively supported with simple instruction set architecture extensions and microarchitectural enhancements. We have developed braided versions of a number of important algorithms. The braided code is easy to understand at the source level and can be translated into highly efficient instructions using our architecture extensions.", acknowledgement = ack-nhfb, fjournal = "IBM Journal of Research and Development", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520", ordernumber = "G322-0247-00", } @Article{Basile:2006:ARM, author = "Claudio Basile and Zbigniew Kalbarczyk and Ravishankar K. Iyer", title = "Active Replication of Multithreaded Applications", journal = j-IEEE-TRANS-PAR-DIST-SYS, volume = "17", number = "5", pages = "448--465", month = may, year = "2006", CODEN = "ITDSEO", DOI = "https://doi.org/10.1109/TPDS.2006.56", ISSN = "1045-9219 (print), 1558-2183 (electronic)", ISSN-L = "1045-9219", bibdate = "Thu Jul 3 14:26:49 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://csdl.computer.org/comp/trans/td/2006/05/l0448s.pdf", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Parallel and Distributed Systems", journal-URL = "http://www.computer.org/tpds/archives.htm", } @Article{Blundell:2006:AGT, author = "Colin Blundell and Dimitra Giannakopoulou and Corina S. 
P{\u{a}}s{\u{a}}reanu", title = "Assume-guarantee testing", journal = j-SIGSOFT, volume = "31", number = "2", pages = "1:1--1:??", month = mar, year = "2006", CODEN = "SFENDP", DOI = "https://doi.org/10.1145/1108768.1123060", ISSN = "0163-5948 (print), 1943-5843 (electronic)", ISSN-L = "0163-5948", bibdate = "Wed Aug 1 17:15:15 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigsoft2000.bib", abstract = "Verification techniques for component-based systems should ideally be able to predict properties of the assembled system through analysis of individual components before assembly. This work introduces such a modular technique in the context of testing. Assume-guarantee testing relies on the (automated) decomposition of key system-level requirements into local component requirements at design time. Developers can verify the local requirements by checking components in isolation; failed checks may indicate violations of system requirements, while valid traces from different components compose via the assume-guarantee proof rule to potentially provide system coverage. These local requirements also form the foundation of a technique for efficient predictive testing of assembled systems: given a correct system run, this technique can predict violations by alternative system runs without constructing those runs. We discuss the application of our approach to testing a multi-threaded NASA application, where we treat threads as components.", acknowledgement = ack-nhfb, articleno = "1", fjournal = "ACM SIGSOFT Software Engineering Notes", journal-URL = "https://dl.acm.org/citation.cfm?id=J728", } @Article{Blundell:2006:STM, author = "C. Blundell and E. C. Lewis and M. M. K. 
Martin", title = "Subtleties of transactional memory atomicity semantics", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "5", number = "2", pages = "17--17", month = feb, year = "2006", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2006.18", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Transactional memory has great potential for simplifying multithreaded programming by allowing programmers to specify regions of the program that must appear to execute atomically. Transactional memory implementations then optimistically execute these transactions concurrently to obtain high performance. This work shows that the same atomic guarantees that give transactions their power also have unexpected and potentially serious negative effects on programs that were written assuming narrower scopes of atomicity. We make four contributions: (1) we show that a direct translation of lock-based critical sections into transactions can introduce deadlock into otherwise correct programs, (2) we introduce the terms strong atomicity and weak atomicity to describe the interaction of transactional and non-transactional code, (3) we show that code that is correct under weak atomicity can deadlock under strong atomicity, and (4) we demonstrate that sequentially composing transactional code can also introduce deadlocks. 
These observations invalidate the intuition that transactions are strictly safer than lock-based critical sections, that strong atomicity is strictly safer than weak atomicity, and that transactions are always composable", acknowledgement = ack-nhfb, fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Computer languages; Computer Systems Organization; Concurrent distributed and parallel languages; deadlock; direct translation; Hardware; Information science; Interference; Interleaved codes; Language Classifications; Law; lock-based critical sections; Multi-core/single-chip multiprocessors; multi-threading; Multiple Data Stream Architectures (Multiprocessors); multithreaded programming; nontransactional code; operating systems (computers); Parallel Architectures; Processor Architectures; program verification; Programming Languages; Programming profession; sequentially composing transactional code; Software performance; Software/Software Engineering; strong atomicity; System recovery; Transaction databases; transaction processing; transactional memory atomicity semantics; weak atomicity", } @Article{Bracy:2006:DAC, author = "A. Bracy and K. Doshi and Q. Jacobson", title = "Disintermediated Active Communication", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "5", number = "2", pages = "15--15", month = feb, year = "2006", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2006.15", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Disintermediated active communication (DAC) is a new paradigm of communication in which a sending thread actively engages a receiving thread when sending it a message via shared memory. 
DAC is different than existing approaches that use passive communication through shared-memory --- based on intermittently checking for messages --- or that use preemptive communication but must rely on intermediaries such as the operating system or dedicated interrupt channels. An implementation of DAC builds on existing cache coherency support and exploits light-weight user-level interrupts. Inter-thread communication occurs via monitored memory locations where the receiver thread responds to invalidations of monitored addresses with a light-weight user-level software-defined handler. Address monitoring is supported by cache line user-bits, or CLUbits. CLUbits reside in the cache next to the coherence state, are private per thread, and maintain user-defined per-cache-line state. A light weight software library can demultiplex asynchronous notifications and handle exceptional cases. In DAC-based programs threads coordinate with one another by explicit signaling and implicit resource monitoring. With the simple and direct communication primitives of DAC, multi-threaded workloads synchronize at a finer granularity and more efficiently utilize the hardware of upcoming multi-core designs. This paper introduces DAC, presents several signaling models for DAC-based programs, and describes a simple memory-based framework that supports DAC by leveraging existing cache-coherency models. 
Our framework is general enough to support uses beyond DAC", acknowledgement = ack-nhfb, fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "address monitoring; cache coherency; cache line user-bits; cache storage; CLUbits; Computer aided instruction; Concurrent computing; disintermediated active communication; Hardware; High performance computing; interrupts; interthread communication; memory locations; Monitoring; multi-threading; multicore designs; Operating systems; Processor scheduling; Programming profession; resource monitoring; shared memory; shared memory systems; signaling models; software libraries; Software libraries; software library; storage allocation; user-level interrupts", } @Article{Brzuszek:2006:MTS, author = "Marcin Brzuszek and Andrzej Daniluk", title = "Multithreaded transactions in scientific computing: New versions of a computer program for kinematical calculations of {RHEED} intensity oscillations", journal = j-COMP-PHYS-COMM, volume = "175", number = "10", pages = "678--681", day = "15", month = nov, year = "2006", CODEN = "CPHCBZ", DOI = "https://doi.org/10.1016/j.cpc.2006.06.013", ISSN = "0010-4655 (print), 1879-2944 (electronic)", ISSN-L = "0010-4655", bibdate = "Mon Feb 13 23:42:10 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/compphyscomm2000.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.sciencedirect.com/science/article/pii/S0010465506002979", acknowledgement = ack-nhfb, fjournal = "Computer Physics Communications", journal-URL = "http://www.sciencedirect.com/science/journal/00104655", } @Article{Cerin:2006:MSS, author = "Christophe C{\'e}rin and Jean-Luc Gaudiot and Michel Koskas", title = "A Multithreaded {SQL} Service", journal = j-PARALLEL-PROCESS-LETT, volume = "16", number = "2", pages = "245--259", month = jun, year = "2006", CODEN = "PPLTEE", DOI = 
"https://doi.org/10.1142/S0129626406002605", ISSN = "0129-6264 (print), 1793-642X (electronic)", bibdate = "Thu Sep 2 09:08:11 MDT 2010", bibsource = "http://ejournals.wspc.com.sg/ppl/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/parallelprocesslett.bib", acknowledgement = ack-nhfb, fjournal = "Parallel Processing Letters", journal-URL = "http://www.worldscientific.com/loi/ppl", } @Article{Chakraborty:2006:CSE, author = "Koushik Chakraborty and Philip M. Wells and Gurindar S. Sohi", title = "Computation spreading: employing hardware migration to specialize {CMP} cores on-the-fly", journal = j-SIGPLAN, volume = "41", number = "11", pages = "283--292", month = nov, year = "2006", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1168919.1168893", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 10:49:40 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "In canonical parallel processing, the operating system (OS) assigns a processing core to a single thread from a multithreaded server application. Since different threads from the same application often carry out similar computation, albeit at different times, we observe extensive code reuse among different processors, causing redundancy (e.g., in our server workloads, 45-65\% of all instruction blocks are accessed by all processors). Moreover, largely independent fragments of computation compete for the same private resources causing destructive interference. Together, this redundancy and interference lead to poor utilization of private microarchitecture resources such as caches and branch predictors. 
We present Computation Spreading (CSP), which employs hardware migration to distribute a thread's dissimilar fragments of computation across the multiple processing cores of a chip multiprocessor (CMP), while grouping similar computation fragments from different threads together. This paper focuses on a specific example of CSP for OS intensive server applications: separating application level (user) computation from the OS calls it makes. When performing CSP, each core becomes temporally specialized to execute certain computation fragments, and the same core is repeatedly used for such fragments. We examine two specific thread assignment policies for CSP, and show that these policies, across four server workloads, are able to reduce instruction misses in private L2 caches by 27-58\%, private L2 load misses by 0-19\%, and branch mispredictions by 9-25\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "cache locality; dynamic specialization", } @Article{Chuang:2006:UPB, author = "Weihaw Chuang and Satish Narayanasamy and Ganesh Venkatesh and Jack Sampson and Michael {Van Biesbrouck} and Gilles Pokam and Brad Calder and Osvaldo Colavin", title = "Unbounded page-based transactional memory", journal = j-SIGPLAN, volume = "41", number = "11", pages = "347--358", month = nov, year = "2006", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1168918.1168901", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 10:49:40 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Exploiting thread level parallelism is paramount in the multicore era. Transactions enable programmers to expose such parallelism by greatly simplifying the multi-threaded programming model. 
Virtualized transactions (unbounded in space and time) are desirable, as they can increase the scope of transactions' use, and thereby further simplify a programmer's job. However, hardware support is essential to support efficient execution of unbounded transactions. In this paper, we introduce Page-based Transactional Memory to support unbounded transactions. We combine transaction bookkeeping with the virtual memory system to support fast transaction conflict detection, commit, abort, and to maintain transactions' speculative data.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "concurrency; parallel programming; transactional memory; transactions; virtual memory", } @Article{Ergin:2006:ENV, author = "O. Ergin and O. Unsal and X. Vera and A. Gonzalez", title = "Exploiting Narrow Values for Soft Error Tolerance", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "5", number = "2", pages = "12--12", month = feb, year = "2006", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2006.12", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Soft errors are an important challenge in contemporary microprocessors. Particle hits on the components of a processor are expected to create an increasing number of transient errors with each new microprocessor generation. In this paper we propose simple mechanisms that effectively reduce the vulnerability to soft errors in a processor. Our designs are generally motivated by the fact that many of the produced and consumed values in the processors are narrow and their upper order bits are meaningless. Soft errors caused by any particle strike to these higher order bits can be avoided by simply identifying these narrow values. 
Alternatively soft errors can be detected or corrected on the narrow values by replicating the vulnerable portion of the value inside the storage space provided for the upper order bits of these operands. We offer a variety of schemes that make use of narrow values and analyze their efficiency in reducing soft error vulnerability of level-1 data cache of the processor", acknowledgement = ack-nhfb, fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "cache storage; Cache storage; contemporary microprocessors; data cache; Data Cache; Error correction; error correction; Error Correction; error correction; error detection; Hardware; Impurities; Manufacturing; microprocessor chips; Microprocessors; Multithreading; Narrow Values; narrow values; Neutrons; particle strike; Process design; radiation effects; Random access memory; soft error tolerance; Soft Errors; system recovery; transient errors; transients", } @Article{Factor:2006:PID, author = "Michael Factor and Assaf Schuster and Konstantin Shagin", title = "A Platform-Independent Distributed Runtime for Standard Multithreaded {Java}", journal = j-INT-J-PARALLEL-PROG, volume = "34", number = "2", pages = "113--142", month = apr, year = "2006", CODEN = "IJPPE5", DOI = "https://doi.org/10.1007/s10766-006-0007-0", ISSN = "0885-7458 (print), 1573-7640 (electronic)", ISSN-L = "0885-7458", bibdate = "Wed Jul 9 16:05:55 MDT 2008", bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=34&issue=2; https://www.math.utah.edu/pub/tex/bib/java2000.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=34&issue=2&spage=113", acknowledgement = ack-nhfb, fjournal = "International Journal of Parallel Programming", journal-URL = "http://link.springer.com/journal/10766", keywords = "bytecode instrumentation; distributed 
computing; distributed shared memory; Java", } @Article{Gomez:2006:SCM, author = "Juan Carlos Gomez and Vernon Rego and V. S. Sunderam", title = "Scheduling communication in multithreaded programs: experimental results", journal = j-CCPE, volume = "18", number = "1", pages = "1--28", month = jan, year = "2006", CODEN = "CCPEBO", DOI = "https://doi.org/10.1002/cpe.904", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Mon Dec 5 10:08:00 MST 2011", bibsource = "http://www.interscience.wiley.com/jpages/1532-0626; https://www.math.utah.edu/pub/tex/bib/ccpe.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Concurrency and Computation: Prac\-tice and Experience", journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626", onlinedate = "13 Sep 2005", } @Article{Gomez:2006:STC, author = "Juan Carlos Gomez and Jorge R. Ramos and Vernon Rego", title = "Signals, timers, and continuations for multithreaded user-level protocols", journal = j-SPE, volume = "36", number = "5", pages = "449--471", day = "25", month = apr, year = "2006", CODEN = "SPEXBL", DOI = "https://doi.org/10.1002/spe.700", ISSN = "0038-0644 (print), 1097-024X (electronic)", ISSN-L = "0038-0644", bibdate = "Wed Oct 17 18:33:12 MDT 2007", bibsource = "http://www.interscience.wiley.com/jpages/0038-0644; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www3.interscience.wiley.com/journalfinder.html", acknowledgement = ack-nhfb, fjournal = "Software---Practice and Experience", journal-URL = "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1097-024X", onlinedate = "19 Jan 2006", } @Article{Grelck:2006:SFA, author = "Clemens Grelck and Sven-Bodo Scholz", title = "{SAC} --- a Functional Array Language for Efficient Multi-threaded Execution", journal = j-INT-J-PARALLEL-PROG, volume = "34", number = "4", pages = "383--427", month = aug, year = "2006", CODEN = "IJPPE5", DOI = 
"https://doi.org/10.1007/s10766-006-0018-x", ISSN = "0885-7458 (print), 1573-7640 (electronic)", ISSN-L = "0885-7458", bibdate = "Wed Jul 9 16:06:07 MDT 2008", bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=34&issue=4; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=34&issue=4&spage=383", acknowledgement = ack-nhfb, fjournal = "International Journal of Parallel Programming", journal-URL = "http://link.springer.com/journal/10766", keywords = "Compiler optimisation; data parallel programming; multi-threading; Single Assignment C", } @Article{Kaiser:2006:CJC, author = "Claude Kaiser and Jean-Fran{\c{c}}ois Pradat-Peyre and Sami {\'E}vangelista and Pierre Rousseau", title = "Comparing {Java}, {C\#} and {Ada} monitors queuing policies: a case study and its {Ada} refinement", journal = j-SIGADA-LETTERS, volume = "26", number = "2", pages = "23--37", month = aug, year = "2006", CODEN = "AALEE5", DOI = "https://doi.org/10.1145/1165678.1165681", ISSN = "1094-3641 (print), 1557-9476 (electronic)", ISSN-L = "1094-3641", bibdate = "Tue Jun 17 09:16:14 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Learning concurrency paradigms is necessary but it is not sufficient since the choice of run-time semantics may introduce subtle programming errors. It is the aim of this paper to exemplify the importance of process queuing and awaking policies resulting from possible choices of the monitor concept implementation. The first part of the paper compares the behaviour of concurrent processes sharing a unique waiting queue for condition synchronization when implemented in Java or in Ada. A particular solution of the dining philosophers paradigm will be used to show how the difference in the monitor semantics may lead or not to deadlock. 
This comparison provides insight for deriving a correct Java implementation. The second part of the paper shows how the implementation can be refined when using Ada entry families and requeue with requeue once restriction. The result is elegant, safe and fair, and deterministic. This paper ends with quantitative comparisons of concurrency complexity and of concurrency effectiveness. We conclude that Java and C\# multithreading need defensive concurrent programming while Ada allows more latitude for developing correct concurrent programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGAda Ada Letters", } @Article{Kim:2006:ERI, author = "Seon Wook Kim and Chong-Liang Ooi and Rudolf Eigenmann and Babak Falsafi and T. N. Vijaykumar", title = "Exploiting reference idempotency to reduce speculative storage overflow", journal = j-TOPLAS, volume = "28", number = "5", pages = "942--965", month = sep, year = "2006", CODEN = "ATPSDT", DOI = "https://doi.org/10.1145/1152649.1152653", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Wed Sep 6 07:13:55 MDT 2006", bibsource = "http://www.acm.org/pubs/contents/journals/toplas/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Recent proposals for multithreaded architectures employ speculative execution to allow threads with unknown dependences to execute speculatively in parallel. The architectures use hardware speculative storage to buffer speculative data, track data dependences and correct incorrect executions through roll-backs. Because all memory references access the speculative storage, current proposals implement speculative storage using small memory structures to achieve fast access. The limited capacity of the speculative storage causes considerable performance loss due to speculative storage overflow whenever a thread's speculative state exceeds the speculative storage capacity. 
Larger threads exacerbate the overflow problem but are preferable to smaller threads, as larger threads uncover more parallelism. In this article, we discover a new program property called memory reference idempotency. Idempotent references are guaranteed to be eventually corrected, though the references may be temporarily incorrect in the process of speculation. Therefore, idempotent references, even from nonparallelizable program sections, need not be tracked in the speculative storage, and instead can directly access nonspeculative storage (i.e., conventional memory hierarchy). Thus, we reduce the demand for speculative storage space in large threads. We define a formal framework for reference idempotency and present a novel compiler-assisted speculative execution model. We prove the necessary and sufficient conditions for reference idempotency using our model. We present a compiler algorithm to label idempotent memory references for the hardware. Experimental results show that for our benchmarks, over 60\% of the references in nonparallelizable program sections are idempotent.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Programming Languages and Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783", } @Article{Kyriacou:2006:CCO, author = "Costas Kyriacou and Paraskevas Evripidou and Pedro Trancoso", title = "{CacheFlow}: Cache Optimizations for Data Driven Multithreading", journal = j-PARALLEL-PROCESS-LETT, volume = "16", number = "2", pages = "229--244", month = jun, year = "2006", CODEN = "PPLTEE", DOI = "https://doi.org/10.1142/S0129626406002599", ISSN = "0129-6264 (print), 1793-642X (electronic)", bibdate = "Thu Sep 2 09:08:11 MDT 2010", bibsource = "http://ejournals.wspc.com.sg/ppl/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/parallelprocesslett.bib", acknowledgement = ack-nhfb, fjournal = "Parallel Processing Letters", journal-URL = 
"http://www.worldscientific.com/loi/ppl", } @Article{Kyriacou:2006:DDM, author = "Costas Kyriacou and Paraskevas Evripidou and Pedro Trancoso", title = "Data-Driven Multithreading Using Conventional Microprocessors", journal = j-IEEE-TRANS-PAR-DIST-SYS, volume = "17", number = "10", pages = "1176--1188", month = oct, year = "2006", CODEN = "ITDSEO", DOI = "https://doi.org/10.1109/TPDS.2006.136", ISSN = "1045-9219 (print), 1558-2183 (electronic)", ISSN-L = "1045-9219", bibdate = "Thu Jul 3 14:26:50 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Parallel and Distributed Systems", journal-URL = "http://www.computer.org/tpds/archives.htm", } @TechReport{Lee:2006:PT, author = "Edward A. Lee", title = "The Problem with Threads", type = "Technical Report", number = "UCB/EECS-2006-1", institution = "Electrical Engineering and Computer Sciences. University of California at Berkeley", address = "Berkeley, CA, USA", day = "10", month = jan, year = "2006", bibdate = "Thu Oct 23 15:07:59 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.eecs.berkeley.edu/Pubs/TechRpts/2006/EECS-2006-1.html", abstract = "Threads are a seemingly straightforward adaptation of the dominant sequential model of computation to concurrent systems. Languages require little or no syntactic changes to support threads, and operating systems and architectures have evolved to efficiently support them. Many technologists are pushing for increased use of multithreading in software in order to take advantage of the predicted increases in parallelism in computer architectures. In this paper, I argue that this is not a good idea. Although threads seem to be a small step from sequential computation, in fact, they represent a huge step. 
They discard the most essential and appealing properties of sequential computation: understandability, predictability, and determinism. Threads, as a model of computation, are wildly nondeterministic, and the job of the programmer becomes one of pruning that nondeterminism. Although many research techniques improve the model by offering more effective pruning, I argue that this is approaching the problem backwards. Rather than pruning nondeterminism, we should build from essentially deterministic, composable components. Nondeterminism should be explicitly and judiciously introduced where needed, rather than removed where not needed. The consequences of this principle are profound. I argue for the development of concurrent coordination languages based on sound, composable formalisms. I believe that such languages will yield much more reliable, and more concurrent programs.", acknowledgement = ack-nhfb, } @Article{Lee:2006:TBR, author = "S.-W. Lee and J.-L. Gaudiot", title = "Throttling-Based Resource Management in High Performance Multithreaded Architectures", journal = j-IEEE-TRANS-COMPUT, volume = "55", number = "9", pages = "1142--1152", month = sep, year = "2006", CODEN = "ITCOB4", DOI = "https://doi.org/10.1109/TC.2006.154", ISSN = "0018-9340 (print), 1557-9956 (electronic)", ISSN-L = "0018-9340", bibdate = "Mon Jul 4 15:35:56 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2000.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=1668042", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Computers", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12", } @Article{Li:2006:MEMa, author = "Xin Li and Marian Boldt and Reinhard von Hanxleden", title = "Mapping {Esterel} onto a multi-threaded embedded processor", journal = j-COMP-ARCH-NEWS, volume = "34", number = "5", pages = "303--314", month = dec, year = "2006", CODEN = 
"CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Oct 27 06:18:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Li:2006:MEMb, author = "Xin Li and Marian Boldt and Reinhard von Hanxleden", title = "Mapping {Esterel} onto a multi-threaded embedded processor", journal = j-OPER-SYS-REV, volume = "40", number = "5", pages = "303--314", month = dec, year = "2006", CODEN = "OSRED8", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Fri Oct 27 06:18:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGOPS Operating Systems Review", } @Article{Li:2006:MEMc, author = "Xin Li and Marian Boldt and Reinhard von Hanxleden", title = "Mapping {Esterel} onto a multi-threaded embedded processor", journal = j-SIGPLAN, volume = "41", number = "11", pages = "303--314", month = nov, year = "2006", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1168857.1168896", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 10:49:40 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The synchronous language Esterel is well-suited for programming control-dominated reactive systems at the system level. It provides non-traditional control structures, in particular concurrency and various forms of preemption, which allow to concisely express reactive behavior. As these control structures cannot be mapped easily onto traditional, sequential processors, an alternative approach that has emerged recently makes use of special-purpose reactive processors. 
However, the designs proposed so far have limitations regarding completeness of the language support, and did not really take advantage of compile-time knowledge to optimize resource usage. This paper presents a reactive processor, the Kiel Esterel Processor 3a (KEP3a), and its compiler. The KEP3a improves on earlier designs in several areas; most notable are the support for exception handling and the provision of context-dependent preemption handling instructions. The KEP3a compiler presented here is to our knowledge the first for multi-threaded reactive processors. The translation of Esterel's preemption constructs onto KEP3a assembler is straightforward; however, a challenge is the correct and efficient representation of Esterel's concurrency. The compiler generates code that respects data and control dependencies using the KEP3a priority-based scheduling mechanism. We present a priority assignment approach that makes use of a novel concurrent control flow graph and has a complexity that in practice tends to be linear in the size of the program. Unlike earlier Esterel compilation schemes, this approach avoids unnecessary context switches by considering each thread's actual execution state at run time. Furthermore, it avoids code replication present in other approaches.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "concurrency; Esterel; low-power processing; multi-threading; reactive systems", } @Article{Li:2006:SDH, author = "Tong Li and Alvin R. Lebeck and Daniel J. 
Sorin", title = "Spin Detection Hardware for Improved Management of Multithreaded Systems", journal = j-IEEE-TRANS-PAR-DIST-SYS, volume = "17", number = "6", pages = "508--521", month = jun, year = "2006", CODEN = "ITDSEO", DOI = "https://doi.org/10.1109/TPDS.2006.78", ISSN = "1045-9219 (print), 1558-2183 (electronic)", ISSN-L = "1045-9219", bibdate = "Thu Jul 3 14:26:49 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Parallel and Distributed Systems", journal-URL = "http://www.computer.org/tpds/archives.htm", } @Article{Moon:2006:TMS, author = "Sewon Moon and Byeong-Mo Chang", title = "A thread monitoring system for multithreaded {Java} programs", journal = j-SIGPLAN, volume = "41", number = "5", pages = "21--29", month = may, year = "2006", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1149982.1149985", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 10:42:34 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "To assist developing robust multithreaded software, we develop a thread monitoring system for multithreaded Java programs, which can trace or monitor running threads and synchronization. We design a monitoring system which has options to select interesting threads and synchronized actions. Using this tool, programmers can monitor only interesting threads and synchronization in more details by selecting options, and can detect a deadlock. It also provides profile information after execution, which summarizes behavior of running threads and synchronized actions during execution. 
We implement the system based on code inlining, and present some experimental results.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "Java; monitoring; synchronization; thread", } @Article{Morad:2006:PPE, author = "T. Y. Morad and U. C. Weiser and A. Kolodny and M. Valero and E. Ayguade", title = "Performance, power efficiency and scalability of asymmetric cluster chip multiprocessors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "5", number = "1", pages = "14--17", month = jan, year = "2006", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2006.6", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "This paper evaluates asymmetric cluster chip multiprocessor (ACCMP) architectures as a mechanism to achieve the highest performance for a given power budget. ACCMPs execute serial phases of multithreaded programs on large high-performance cores whereas parallel phases are executed on a mix of large and many small simple cores. Theoretical analysis reveals a performance upper bound for symmetric multiprocessors, which is surpassed by asymmetric configurations at certain power ranges. 
Our emulations show that asymmetric multiprocessors can reduce power consumption by more than two thirds with similar performance compared to symmetric multiprocessors", acknowledgement = ack-nhfb, fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "ACCMP; Application software; asymmetric cluster chip multiprocessors; Chip Multiprocessors; Emulation; Frequency; microprocessor chips; multi-threading; multiprocessing systems; multithreaded program; Optimized production technology; Parallel processing; parallel processing; power consumption reduction; power efficiency; Power Efficiency; Power system modeling; Queueing analysis; Scalability; Upper bound; Voltage", } @Article{Naik:2006:ESR, author = "Mayur Naik and Alex Aiken and John Whaley", title = "Effective static race detection for {Java}", journal = j-SIGPLAN, volume = "41", number = "6", pages = "308--319", month = jun, year = "2006", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1133255.1134018", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 10:42:48 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "We present a novel technique for static race detection in Java programs, comprised of a series of stages that employ a combination of static analyses to successively reduce the pairs of memory accesses potentially involved in a race. We have implemented our technique and applied it to a suite of multi-threaded Java programs. 
Our experiments show that it is precise, scalable, and useful, reporting tens to hundreds of serious and previously unknown concurrency bugs in large, widely-used programs with few false alarms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "concurrency; Java; multi-threading; static race detection; synchronization", } @Article{Nanda:2006:ISM, author = "Mangala Gowri Nanda and S. Ramesh", title = "Interprocedural slicing of multithreaded programs with applications to {Java}", journal = j-TOPLAS, volume = "28", number = "6", pages = "1088--1144", month = nov, year = "2006", CODEN = "ATPSDT", DOI = "https://doi.org/10.1145/1186632.1186636", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Sat Apr 14 11:13:21 MDT 2007", bibsource = "http://www.acm.org/pubs/contents/journals/toplas/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Programming Languages and Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783", } @Article{Narayanasamy:2006:RSM, author = "Satish Narayanasamy and Cristiano Pereira and Brad Calder", title = "Recording shared memory dependencies using strata", journal = j-SIGPLAN, volume = "41", number = "11", pages = "229--240", month = nov, year = "2006", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1168857.1168886", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 10:49:40 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Significant time is spent by companies trying to reproduce and fix bugs. BugNet and FDR are recent architecture proposals that provide architecture support for deterministic replay debugging. 
They focus on continuously recording information about the program's execution, which can be communicated back to the developer. Using that information, the developer can deterministically replay the program's execution to reproduce and fix the bugs. In this paper, we propose using Strata to efficiently capture the shared memory dependencies. A stratum creates a time layer across all the logs for the running threads, which separates all the memory operations executed before and after the stratum. A strata log allows us to determine all the shared memory dependencies during replay and thereby supports deterministic replay debugging for multi-threaded programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "debugging; dependencies; logging; replay; shared memory; strata", } @Article{Ottoni:2006:SPC, author = "G. Ottoni and R. Rangan and A. Stoler and M. J. Bridges and D. I. August", title = "From sequential programs to concurrent threads", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "5", number = "1", pages = "6--9", month = jan, year = "2006", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2006.5", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Chip multiprocessors are of increasing importance due to difficulties in achieving higher clock frequencies in uniprocessors, but their success depends on finding useful work for the processor cores. This paper addresses this challenge by presenting a simple compiler approach that extracts non-speculative thread-level parallelism from sequential codes. 
We present initial results from this technique targeting a validated dual-core processor model, achieving speedups ranging from 9-48\% with an average of 25\% for important benchmark loops over their single-threaded versions. We also identify important next steps found during our pursuit of higher degrees of automatic threading", acknowledgement = ack-nhfb, fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "automatic threading; Bridges; Clocks; Computer science; concurrency control; concurrent threads; Frequency; Hardware; Microprocessors; multi-threading; nonspeculative thread-level parallelism; Parallel processing; Pipeline processing; program compiler; program compilers; Program processors; sequential programs", } @Article{Parashar:2006:SSBa, author = "Angshuman Parashar and Anand Sivasubramaniam and Sudhanva Gurumurthi", title = "{SlicK}: slice-based locality exploitation for efficient redundant multithreading", journal = j-COMP-ARCH-NEWS, volume = "34", number = "5", pages = "95--105", month = dec, year = "2006", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Oct 27 06:18:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Parashar:2006:SSBb, author = "Angshuman Parashar and Anand Sivasubramaniam and Sudhanva Gurumurthi", title = "{SlicK}: slice-based locality exploitation for efficient redundant multithreading", journal = j-OPER-SYS-REV, volume = "40", number = "5", pages = "95--105", month = dec, year = "2006", CODEN = "OSRED8", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Fri Oct 27 06:18:30 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement 
= ack-nhfb, fjournal = "ACM SIGOPS Operating Systems Review", } @Article{Parashar:2006:SSBc, author = "Angshuman Parashar and Anand Sivasubramaniam and Sudhanva Gurumurthi", title = "{SlicK}: slice-based locality exploitation for efficient redundant multithreading", journal = j-SIGPLAN, volume = "41", number = "11", pages = "95--105", month = nov, year = "2006", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1168857.1168870", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 10:49:40 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Transient faults are expected to be a major design consideration in future microprocessors. Recent proposals for transient fault detection in processor cores have revolved around the idea of redundant threading, which involves redundant execution of a program across multiple execution contexts. This paper presents a new approach to redundant threading by bringing together the concepts of slice-level execution and value and control-flow locality into a novel partial redundant threading mechanism called SlicK. The purpose of redundant execution is to check the integrity of the outputs propagating out of the core (typically through stores). SlicK implements redundancy at the granularity of backward-slices of these output instructions and exploits value and control-flow locality to avoid redundantly executing slices that lead to predictable outputs, thereby avoiding redundant execution of a significant fraction of instructions while maintaining extremely low vulnerabilities for critical processor structures. We propose the microarchitecture of a backward-slice extractor called SliceEM that is able to identify backward slices without interrupting the instruction flow, and show how this extractor and a set of predictors can be integrated into a redundant threading mechanism to form SlicK. 
Detailed simulations with SPEC CPU2000 benchmarks show that SlicK can provide around 10.2\% performance improvement over a well known redundant threading mechanism, buying back over 50\% of the loss suffered due to redundant execution. SlicK can keep the Architectural Vulnerability Factors of processor structures to typically 0\%-2\%. More importantly, SlicK's slice-based mechanisms provide future opportunities for exploring interesting points in the performance-reliability design space based on market segment needs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "backward slice extraction; microarchitecture; redundant threading; transient faults", } @Article{Pickett:2006:SSF, author = "Christopher J. F. Pickett and Clark Verbrugge", title = "{SableSpMT}: a software framework for analysing speculative multithreading in {Java}", journal = j-SIGSOFT, volume = "31", number = "1", pages = "59--66", month = jan, year = "2006", CODEN = "SFENDP", DOI = "https://doi.org/10.1145/1108768.1108809", ISSN = "0163-5948 (print), 1943-5843 (electronic)", ISSN-L = "0163-5948", bibdate = "Wed Aug 1 17:15:12 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2000.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigsoft2000.bib", abstract = "Speculative multithreading (SpMT) is a promising optimisation technique for achieving faster execution of sequential programs on multiprocessor hardware. Analysis of and data acquisition from such systems is however difficult and complex, and is typically limited to a specific hardware design and simulation environment. We have implemented a flexible, software-based speculative multithreading architecture within the context of a full-featured Java virtual machine. 
We consider the entire Java language and provide a complete set of support features for speculative execution, including return value prediction. Using our system we are able to generate extensive dynamic analysis information, analyse the effects of runtime feedback, and determine the impact of incorporating static, offline information. Our approach allows for accurate analysis of Java SpMT on existing, commodity multiprocessor hardware, and provides a vehicle for further experimentation with speculative approaches and optimisations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGSOFT Software Engineering Notes", journal-URL = "https://dl.acm.org/citation.cfm?id=J728", } @Article{Pratikakis:2006:LCS, author = "Polyvios Pratikakis and Jeffrey S. Foster and Michael Hicks", title = "{LOCKSMITH}: context-sensitive correlation analysis for race detection", journal = j-SIGPLAN, volume = "41", number = "6", pages = "320--331", month = jun, year = "2006", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1133255.1134019", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 10:42:48 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "One common technique for preventing data races in multi-threaded programs is to ensure that all accesses to shared locations are consistently protected by a lock. We present a tool called LOCKSMITH for detecting data races in C programs by looking for violations of this pattern. We call the relationship between locks and the locations they protect consistent correlation, and the core of our technique is a novel constraint-based analysis that infers consistent correlation context-sensitively, using the results to check that locations are properly guarded by locks. 
We present the core of our algorithm for a simple formal language {$\lambda_>$} which we have proven sound, and discuss how we scale it up to an algorithm that aims to be sound for all of C. We develop several techniques to improve the precision and performance of the analysis, including a sharing analysis for inferring thread locality; existential quantification for modeling locks in data structures; and heuristics for modeling unsafe features of C such as type casts. When applied to several benchmarks, including multi-threaded servers and Linux device drivers, LOCKSMITH found several races while producing a modest number of false alarms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "context-sensitivity; correlation; locksmith; multi-threaded programming; race detection; type inference", } @Article{Reddy:2006:UPB, author = "Vimal K. Reddy and Eric Rotenberg and Sailashri Parthasarathy", title = "Understanding prediction-based partial redundant threading for low-overhead, high-coverage fault tolerance", journal = j-SIGPLAN, volume = "41", number = "11", pages = "83--94", month = nov, year = "2006", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1168917.1168869", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 10:49:40 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Redundant threading architectures duplicate all instructions to detect and possibly recover from transient faults. Several lighter weight Partial Redundant Threading (PRT) architectures have been proposed recently. (i) Opportunistic Fault Tolerance duplicates instructions only during periods of poor single-thread performance. (ii) ReStore does not explicitly duplicate instructions and instead exploits mispredictions among highly confident branch predictions as symptoms of faults. 
(iii) Slipstream creates a reduced alternate thread by replacing many instructions with highly confident predictions. We explore PRT as a possible direction for achieving the fault tolerance of full duplication with the performance of single-thread execution. Opportunistic and ReStore yield partial coverage since they are restricted to using only partial duplication or only confident predictions, respectively. Previous analysis of Slipstream fault tolerance was cursory and concluded that only duplicated instructions are covered. In this paper, we attempt to better understand Slipstream's fault tolerance, conjecturing that the mixture of partial duplication and confident predictions actually closely approximates the coverage of full duplication. A thorough dissection of prediction scenarios confirms that faults in nearly 100\% of instructions are detectable. Fewer than 0.1\% of faulty instructions are not detectable due to coincident faults and mispredictions. Next we show that the current recovery implementation fails to leverage excellent detection capability, since recovery sometimes initiates belatedly, after already retiring a detected faulty instruction. We propose and evaluate a suite of simple microarchitectural alterations to recovery and checking. Using the best alterations, Slipstream can recover from faults in 99\% of instructions, compared to only 78\% of instructions without alterations. Both results are much higher than predicted by past research, which claims coverage for only duplicated instructions, or 65\% of instructions. On an 8-issue SMT processor, Slipstream performs within 1.3\% of single-thread execution whereas full duplication slows performance by 14\%. A key byproduct of this paper is a novel analysis framework in which every dynamic instruction is considered to be hypothetically faulty, thus not requiring explicit fault injection. 
Fault coverage is measured in terms of the fraction of candidate faulty instructions that are directly or indirectly detectable before.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "branch prediction; chip multiprocessor (CMP); redundant multithreading; simultaneous multithreading (SMT); slipstream processor; time redundancy; transient faults; value prediction", } @Article{Ro:2006:DEH, author = "Won W. Ro and Stephen P. Crago and Alvin M. Despain and Jean-Luc Gaudiot", title = "Design and evaluation of a hierarchical decoupled architecture", journal = j-J-SUPERCOMPUTING, volume = "38", number = "3", pages = "237--259", month = dec, year = "2006", CODEN = "JOSUED", DOI = "https://doi.org/10.1007/s11227-006-8321-2", ISSN = "0920-8542 (print), 1573-0484 (electronic)", ISSN-L = "0920-8542", bibdate = "Wed Jul 9 17:32:29 MDT 2008", bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=38&issue=3; https://www.math.utah.edu/pub/tex/bib/jsuper.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=38&issue=3&spage=237", acknowledgement = ack-nhfb, fjournal = "The Journal of Supercomputing", journal-URL = "http://link.springer.com/journal/11227", keywords = "Data prefetching; Decoupled architectures; Instruction level parallelism; Memory latency hiding; Multithreading; Parallel architecture; Speculative execution", } @Article{Russell:2006:ESRa, author = "Kenneth Russell and David Detlefs", title = "Eliminating synchronization-related atomic operations with biased locking and bulk rebiasing", journal = j-SIGPLAN, volume = "41", number = "10", pages = "263--272", month = oct, year = "2006", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1167515.1167496", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = 
"Wed Jun 18 10:47:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The Java{\TM} programming language contains built-in synchronization primitives for use in constructing multithreaded programs. Efficient implementation of these synchronization primitives is necessary in order to achieve high performance. Recent research [9, 12, 10, 3, 7] has focused on the run-time elimination of the atomic operations required to implement object monitor synchronization primitives. This paper describes a novel technique called store-free biased locking which eliminates all synchronization-related atomic operations on uncontended object monitors. The technique supports the bulk transfer of object ownership from one thread to another, and the selective disabling of the optimization where unprofitable, using epoch-based bulk rebiasing and revocation. It has been implemented in the production version of the Java HotSpot{\TM}VM and has yielded significant performance improvements on a range of benchmarks and applications. 
The technique is applicable to any virtual machine-based programming language implementation with mostly block-structured locking primitives.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "atomic; bias; Java; lock; monitor; optimization; rebias; reservation; revoke; synchronization", } @Article{Sen:2006:OEP, author = "Koushik Sen and Grigore Rosu and Gul Agha", title = "Online efficient predictive safety analysis of multithreaded programs", journal = j-INT-J-SOFTW-TOOLS-TECHNOL-TRANSFER, volume = "8", number = "3", pages = "248--260", month = jun, year = "2006", CODEN = "????", DOI = "https://doi.org/10.1007/s10009-005-0192-y", ISSN = "1433-2779 (print), 1433-2787 (electronic)", ISSN-L = "1433-2779", bibdate = "Wed Jul 9 18:12:21 MDT 2008", bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=1433-2779&volume=8&issue=3; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=1433-2779&volume=8&issue=3&spage=248", acknowledgement = ack-nhfb, fjournal = "International Journal on Software Tools for Technology Transfer: STTT", keywords = "JMPaX; Multithreaded analysis; Predictive analysis; Runtime monitoring; Vector clock", } @Article{Shin:2006:ADT, author = "Chulho Shin and Seong-Won Lee and Jean-Luc Gaudiot", title = "Adaptive dynamic thread scheduling for simultaneous multithreaded architectures with a detector thread", journal = j-J-PAR-DIST-COMP, volume = "66", number = "10", pages = "1304--1321", month = oct, year = "2006", CODEN = "JPDCER", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Fri Jul 11 20:32:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sciencedirect.com/science/journal/07437315", acknowledgement = ack-nhfb, fjournal = "Journal of Parallel and Distributed Computing", journal-URL = 
"http://www.sciencedirect.com/science/journal/07437315", } @Article{Smith:2006:ITP, author = "Geoffrey Smith", title = "Improved typings for probabilistic noninterference in a multi-threaded language", journal = j-J-COMP-SECUR, volume = "14", number = "6", pages = "591--623", month = "????", year = "2006", CODEN = "JCSIET", DOI = "https://doi.org/10.3233/JCS-2006-14605", ISSN = "0926-227X (print), 1875-8924 (electronic)", ISSN-L = "0926-227X", bibdate = "Tue May 24 06:23:23 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/jcompsecur.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Journal of Computer Security", journal-URL = "http://content.iospress.com/journals/journal-of-computer-security", } @Article{Trancoso:2006:CCM, author = "Pedro Trancoso and Paraskevas Evripidou and Kyriakos Stavrou and Costas Kyriacou", title = "A Case for Chip Multiprocessors Based on the Data-Driven Multithreading Model", journal = j-INT-J-PARALLEL-PROG, volume = "34", number = "3", pages = "213--235", month = jun, year = "2006", CODEN = "IJPPE5", DOI = "https://doi.org/10.1007/s10766-006-0016-z", ISSN = "0885-7458 (print), 1573-7640 (electronic)", ISSN-L = "0885-7458", bibdate = "Wed Jul 9 16:05:59 MDT 2008", bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=34&issue=3; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=34&issue=3&spage=213", acknowledgement = ack-nhfb, fjournal = "International Journal of Parallel Programming", journal-URL = "http://link.springer.com/journal/10766", keywords = "Chip multiprocessor; data-driven execution; multithreading; parallel processing", } @Article{Vasconcelos:2006:TCM, author = "Vasco T. Vasconcelos and Simon J. 
Gay and Ant{\'o}nio Ravara", title = "Type checking a multithreaded functional language with session types", journal = j-THEOR-COMP-SCI, volume = "368", number = "1--2", pages = "64--87", day = "5", month = dec, year = "2006", CODEN = "TCSCDI", ISSN = "0304-3975 (print), 1879-2294 (electronic)", ISSN-L = "0304-3975", bibdate = "Tue Mar 29 08:55:29 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sciencedirect.com/science/journal/03043975", acknowledgement = ack-nhfb, fjournal = "Theoretical Computer Science", journal-URL = "http://www.sciencedirect.com/science/journal/03043975", } @Article{Wang:2006:RAA, author = "L. Wang and S. D. Stoller", title = "Runtime analysis of atomicity for multithreaded programs", journal = j-IEEE-TRANS-SOFTW-ENG, volume = "32", number = "2", pages = "93--110", month = feb, year = "2006", CODEN = "IESEDJ", DOI = "https://doi.org/10.1109/TSE.2006.1599419", ISSN = "0098-5589 (print), 1939-3520 (electronic)", ISSN-L = "0098-5589", bibdate = "Thu Feb 1 11:00:42 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranssoftweng2000.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=1599419", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Software Engineering", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=32", } @Article{Xu:2006:RTR, author = "Min Xu and Mark D. 
Hill and Rastislav Bodik", title = "A regulated transitive reduction {(RTR)} for longer memory race recording", journal = j-SIGPLAN, volume = "41", number = "11", pages = "49--60", month = nov, year = "2006", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1168919.1168865", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 10:49:40 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Multithreaded deterministic replay has important applications in cyclic debugging, fault tolerance and intrusion analysis. Memory race recording is a key technology for multithreaded deterministic replay. In this paper, we considerably improve our previous always-on Flight Data Recorder (FDR) in four ways:\par \begin{itemize} \item Longer recording by reducing the log size growth rate to approximately one byte per thousand dynamic instructions. \item Lower hardware cost by reducing the cost to 24 KB per processor core. \item Simpler design by modifying only the cache coherence protocol, but not the cache. \item Broader applicability by supporting both Sequential Consistency (SC) and Total Store Order (TSO) memory consistency models (existing recorders support only SC). 
\end{itemize} These improvements stem from several ideas: (1) a Regulated Transitive Reduction (RTR) recording algorithm that creates stricter and vectorizable dependencies to reduce the log growth rate; (2) a Set/LRU timestamp approximation method that better approximates timestamps of uncached memory locations to reduce the hardware cost; (3) an order-value-hybrid recording method that explicitly logs the value of potential SC-violating load instructions to support multiprocessor systems with TSO.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "determinism; multithreading; race recording", } @Article{Ziarek:2006:SMC, author = "Lukasz Ziarek and Philip Schatz and Suresh Jagannathan", title = "Stabilizers: a modular checkpointing abstraction for concurrent functional programs", journal = j-SIGPLAN, volume = "41", number = "9", pages = "136--147", month = sep, year = "2006", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1160074.1159822", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 10:46:22 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Transient faults that arise in large-scale software systems can often be repaired by re-executing the code in which they occur. Ascribing a meaningful semantics for safe re-execution in multi-threaded code is not obvious, however. For a thread to correctly re-execute a region of code, it must ensure that all other threads which have witnessed its unwanted effects within that region are also reverted to a meaningful earlier state. If not done properly, data inconsistencies and other undesirable behavior may result. However, automatically determining what constitutes a consistent global checkpoint is not straightforward since thread interactions are a dynamic property of the program.
In this paper, we present a safe and efficient checkpointing mechanism for Concurrent ML (CML) that can be used to recover from transient faults. We introduce a new linguistic abstraction called stabilizers that permits the specification of per-thread monitors and the restoration of globally consistent checkpoints. Safe global states are computed through lightweight monitoring of communication events among threads (e.g. message-passing operations or updates to shared variables). Our experimental results on several realistic, multithreaded, server-style CML applications, including a web server and a windowing toolkit, show that the overheads to use stabilizers are small, and lead us to conclude that they are a viable mechanism for defining safe checkpoints in concurrent functional programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "checkpointing; concurrent ML; concurrent programming; error recovery; exception handling; transactions", } @Article{Benaya:2007:UTA, author = "Tamar Benaya and Ela Zur", title = "Understanding threads in an advanced {Java} course", journal = j-SIGCSE, volume = "39", number = "3", pages = "323--323", month = sep, year = "2007", CODEN = "SIGSD3", DOI = "https://doi.org/10.1145/1269900.1268890", ISSN = "0097-8418 (print), 2331-3927 (electronic)", ISSN-L = "0097-8418", bibdate = "Sat Nov 17 16:57:36 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigcse2000.bib", note = "Proceedings of the 12th Annual SIGCSE Conference on Innovation and Technology in Computer Science Education (ITiCSE'07).", abstract = "This poster describes difficulties in understanding threads in an Advanced Java course given at the Computer Science department of the Open University of Israel (OUI). We present a typical question which focuses on several aspects of multi-threaded programming given in an exam. 
We discuss the students' answers and point to typical misunderstandings of the topic.", acknowledgement = ack-nhfb, fjournal = "SIGCSE Bulletin (ACM Special Interest Group on Computer Science Education)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J688", } @Article{Benner:2007:SLS, author = "Peter Benner and Maribel Castillo and Rafael Mayo and Enrique S. Quintana-Ort{\'\i} and Gregorio Quintana-Ort{\'\i}", title = "Stabilizing large-scale generalized systems on parallel computers using multithreading and message-passing", journal = j-CCPE, volume = "19", number = "4", pages = "531--542", day = "25", month = mar, year = "2007", CODEN = "CCPEBO", DOI = "https://doi.org/10.1002/cpe.1148", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Mon Dec 5 10:08:11 MST 2011", bibsource = "http://www.interscience.wiley.com/jpages/1532-0626; https://www.math.utah.edu/pub/tex/bib/ccpe.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Concurrency and Computation: Prac\-tice and Experience", journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626", onlinedate = "12 Dec 2006", } @Article{Bergstra:2007:SCE, author = "J. A. Bergstra and C. A. 
Middelburg", title = "Synchronous cooperation for explicit multi-threading", journal = j-ACTA-INFO, volume = "44", number = "7--8", pages = "525--569", month = dec, year = "2007", CODEN = "AINFA2", DOI = "https://doi.org/10.1007/s00236-007-0057-9", ISSN = "0001-5903 (print), 1432-0525 (electronic)", ISSN-L = "0001-5903", bibdate = "Wed Jul 9 21:28:19 MDT 2008", bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0001-5903&volume=44&issue=7; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0001-5903&volume=44&issue=7&spage=525", acknowledgement = ack-nhfb, fjournal = "Acta Informatica", journal-URL = "http://www.springerlink.com/content/0001-5903", } @Article{Blundell:2007:MFC, author = "Colin Blundell and Joe Devietti and E. Christopher Lewis and Milo M. K. Martin", title = "Making the fast case common and the uncommon case simple in unbounded transactional memory", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "24--34", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1273440.1250667", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Hardware transactional memory has great potential to simplify the creation of correct and efficient multithreaded programs, allowing programmers to exploit more effectively the soon-to-be-ubiquitous multi-core designs. Several recent proposals have extended the original bounded transactional memory to unbounded transactional memory, a crucial step toward transactions becoming a general-purpose primitive. Unfortunately, supporting the concurrent execution of an unbounded number of unbounded transactions is challenging, and as a result, many proposed implementations are complex.\par This paper explores a different approach. 
First, we introduce the permissions-only cache to extend the bound at which transactions overflow to allow the fast, bounded case to be used as frequently as possible. Second, we propose OneTM to simplify the implementation of unbounded transactional memory by bounding the concurrency of transactions that overflow the cache. These mechanisms work synergistically to provide a simple and fast unbounded transactional memory system.\par The permissions-only cache efficiently maintains the coherence permissions --- but not data --- for blocks read or written transactionally that have been evicted from the processor's caches. By holding coherence permissions for these blocks, the regular cache coherence protocol can be used to detect transactional conflicts using only a few bits of on-chip storage per overflowed cache block. OneTM allows only one overflowed transaction at a time, relying on the permissions-only cache to ensure that overflow is infrequent. We present two implementations. In OneTM-Serialized, an overflowed transaction simply stalls all other threads in the application.\par In OneTM-Concurrent, non-overflowed transactions and non-transactional code can execute concurrently with the overflowed transaction, providing more concurrency while retaining OneTM's core simplifying assumption.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", keywords = "concurrency; parallel programming; transactional memory; transactions", } @Article{Boehm:2007:MCC, author = "Hans Boehm and Bill Pugh and Doug Lea", title = "Multithreading in {C} and {C++}", journal = j-LOGIN, volume = "32", number = "1", pages = "??--??", month = feb, year = "2007", CODEN = "LOGNEM", ISSN = "1044-6397", ISSN-L = "1044-6397", bibdate = "Fri Dec 7 11:34:27 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/usenix2000.bib;
https://www.usenix.org/publications/login", URL = "https://www.usenix.org/publications/login/february-2007-volume-32-number-1/multithreading-c-and-c", acknowledgement = ack-nhfb, fjournal = ";login: the USENIX Association newsletter", } @Article{Burckhardt:2007:CCC, author = "Sebastian Burckhardt and Rajeev Alur and Milo M. K. Martin", title = "{CheckFence}: checking consistency of concurrent data types on relaxed memory models", journal = j-SIGPLAN, volume = "42", number = "6", pages = "12--21", month = jun, year = "2007", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1250734.1250737", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 10:55:30 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Concurrency libraries can facilitate the development of multi-threaded programs by providing concurrent implementations of familiar data types such as queues or sets. There exist many optimized algorithms that can achieve superior performance on multiprocessors by allowing concurrent data accesses without using locks. Unfortunately, such algorithms can harbor subtle concurrency bugs. Moreover, they require memory ordering fences to function correctly on relaxed memory models.\par To address these difficulties, we propose a verification approach that can exhaustively check all concurrent executions of a given test program on a relaxed memory model and can verify that they are observationally equivalent to a sequential execution. Our CheckFence prototype automatically translates the C implementation code and the test program into a SAT formula, hands the latter to a standard SAT solver, and constructs counter example traces if there exist incorrect executions. 
Applying CheckFence to five previously published algorithms, we were able to (1) find several bugs (some not previously known), and (2) determine how to place memory ordering fences for relaxed memory models.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "concurrent data structures; lock-free synchronization; memory models; multi-threading; sequential consistency; shared-memory multiprocessors; software model checking", } @Article{Das:2007:FVT, author = "Dipankar Das and P. P. Chakrabarti and Rajeev Kumar", title = "Functional verification of task partitioning for multiprocessor embedded systems", journal = j-TODAES, volume = "12", number = "4", pages = "44:1--44:??", month = sep, year = "2007", CODEN = "ATASFO", DOI = "https://doi.org/10.1145/1278349.1278357", ISSN = "1084-4309 (print), 1557-7309 (electronic)", ISSN-L = "1084-4309", bibdate = "Thu Jun 12 18:09:35 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/todaes/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "With the advent of multiprocessor embedded platforms, application partitioning and mapping have gained primacy as a design step. The output of this design step is a multithreaded partitioned application where each thread is mapped to a processing element (processor or ASIC) in the multiprocessor platform. This partitioned application must be verified to be consistent with the native unpartitioned application. This verification task is called application (or task) partitioning verification. \par This work proposes a code-block-level containment-checking-based methodology for application partitioning verification. We use a UML-based code-block-level modeling language which is rich enough to model most designs.
We formulate the application partitioning verification problem as a special case of the containment checking problem, which we call the complete containment checking problem. We propose a state space reduction technique specific to the containment checking, reachability analysis, and deadlock detection problems. We propose novel data structures and token propagation methodologies which enhance the efficiency of containment checking. We present an efficient containment checking algorithm for the application partitioning verification problem. We develop a containment checking tool called TraceMatch and present experimental results. We present a comparison of the state space reduction achieved by TraceMatch with that achieved by formal analysis and verification tools like Spin, PEP, PROD, and LoLA.", acknowledgement = ack-nhfb, articleno = "44", fjournal = "ACM Transactions on Design Automation of Electronic Systems (TODAES)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J776", keywords = "Containment checking; multiprocessor embedded systems; state space reduction; UML activity diagrams", } @Article{Dou:2007:CCM, author = "Jialin Dou and Marcelo Cintra", title = "A compiler cost model for speculative parallelization", journal = j-TACO, volume = "4", number = "2", pages = "12:1--12:??", month = jun, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1250727.1250732", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 16 11:40:54 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Speculative parallelization is a technique that allows code sections that cannot be fully analyzed by the compiler to be aggressively executed in parallel. However, while speculative parallelization can potentially deliver significant speedups, several overheads associated with this technique can limit these speedups in practice. 
This paper proposes a novel compiler static cost model of speculative multithreaded execution that can be used to predict the resulting performance. This model attempts to predict the expected speedups, or slowdowns, of the candidate speculative sections based on the estimation of the combined runtime effects of various overheads, and taking into account the scheduling restrictions of most speculative execution environments. The model is based on estimating the likely execution duration of threads and considers all the possible permutations of these threads. This model also produces a quantitative estimate of the speedup, which is different from prior heuristics that only qualitatively estimate the benefits of speculative multithreaded execution. In previous work, a limited version of the framework was evaluated on a number of loops from a collection of SPEC benchmarks that suffer mainly from load imbalance and thread dispatch and commit overheads. In this work, an extended framework is also evaluated on loops that may suffer from data-dependence violations. Experimental results show that prediction accuracy is lower when loops with violations are included. Nevertheless, accuracy is still very high for a static model: the framework can identify, on average, 45\% of the loops that cause slowdowns and, on average, 96\% of the loops that lead to speedups; it predicts the speedups or slowdowns with an error of less than 20\% for an average of 28\% of the loops across the benchmarks and with an error of less than 50\% for an average of 80\% of the loops. 
Overall, the framework often outperforms, by as much as 25\%, a naive approach that attempts to speculatively parallelize all the loops considered, and is able to curb the large slowdowns caused in many cases by this naive approach.", acknowledgement = ack-nhfb, articleno = "12", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", keywords = "speculative multithreading; speculative parallelization; thread-level speculation", } @Article{Elmas:2007:GRT, author = "Tayfun Elmas and Shaz Qadeer and Serdar Tasiran", title = "{Goldilocks}: a race and transaction-aware {Java} runtime", journal = j-SIGPLAN, volume = "42", number = "6", pages = "245--255", month = jun, year = "2007", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1273442.1250762", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 10:55:30 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Data races often result in unexpected and erroneous behavior. In addition to causing data corruption and leading programs to crash, the presence of data races complicates the semantics of an execution which might no longer be sequentially consistent. Motivated by these observations, we have designed and implemented a Java runtime system that monitors program executions and throws a DataRaceException when a data race is about to occur. Analogous to other runtime exceptions, the DataRaceException provides two key benefits. First, accesses causing race conditions are interrupted and handled before they cause errors that may be difficult to diagnose later. Second, if no DataRaceException is thrown in an execution, it is guaranteed to be sequentially consistent. This strong guarantee helps to rule out many concurrency-related possibilities as the cause of erroneous behavior. 
When a DataRaceException is caught, the operation, thread, or program causing it can be terminated gracefully. Alternatively, the DataRaceException can serve as a conflict-detection mechanism in optimistic uses of concurrency.\par We start with the definition of data-race-free executions in the Java memory model. We generalize this definition to executions that use transactions in addition to locks and volatile variables for synchronization. We present a precise and efficient algorithm for dynamically verifying that an execution is free of data races. This algorithm generalizes the Goldilocks algorithm for data-race detection by handling transactions and providing the ability to distinguish between read and write accesses. We have implemented our algorithm and the DataRaceException in the Kaffe Java Virtual Machine. We have evaluated our system on a variety of publicly available Java benchmarks and a few microbenchmarks that combine lock-based and transaction-based synchronization. Our experiments indicate that our implementation has reasonable overhead. Therefore, we believe that in addition to being a debugging tool, the DataRaceException may be a viable mechanism to enforce the safety of executions of multithreaded Java programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "data-race detection; Java runtime; runtime monitoring; software transactions", } @Article{Emer:2007:STV, author = "Joel Emer and Mark D. Hill and Yale N. Patt and Joshua J. Yi and Derek Chiou and Resit Sendag", title = "Single-Threaded vs. 
Multithreaded: Where Should We Focus?", journal = j-IEEE-MICRO, volume = "27", number = "6", pages = "14--24", month = nov # "\slash " # dec, year = "2007", CODEN = "IEMIDZ", DOI = "https://doi.org/10.1109/MM.2007.109", ISSN = "0272-1732 (print), 1937-4143 (electronic)", ISSN-L = "0272-1732", bibdate = "Wed Jul 2 21:58:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeemicro.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Micro", journal-URL = "http://www.computer.org/csdl/mags/mi/index.html", } @Article{Emmi:2007:LA, author = "Michael Emmi and Jeffrey S. Fischer and Ranjit Jhala and Rupak Majumdar", title = "Lock allocation", journal = j-SIGPLAN, volume = "42", number = "1", pages = "291--296", month = jan, year = "2007", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1190216.1190260", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 10:53:14 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "We introduce lock allocation, an automatic technique that takes a multi-threaded program annotated with atomic sections (that must be executed atomically), and infers a lock assignment from global variables to locks and a lock instrumentation that determines where each lock should be acquired and released such that the resulting instrumented program is guaranteed to preserve atomicity and deadlock freedom (provided all shared state is accessed only within atomic sections). Our algorithm works in the presence of pointers and procedures, and sets up the lock allocation problem as a 0-1 ILP which minimizes the conflict cost between atomic sections while simultaneously minimizing the number of locks. We have implemented our algorithm for both C with pthreads and Java, and have applied it to infer locks in 15K lines of AOLserver code. 
Our automatic allocation produces the same results as hand annotations for most of this code, while solving the optimization instances within a second for most programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "atomicity; ILP; lock inference", } @Article{Eytani:2007:TFB, author = "Yaniv Eytani and Klaus Havelund and Scott D. Stoller and Shmuel Ur", title = "Towards a framework and a benchmark for testing tools for multi-threaded programs", journal = j-CCPE, volume = "19", number = "3", pages = "267--279", day = "10", month = mar, year = "2007", CODEN = "CCPEBO", DOI = "https://doi.org/10.1002/cpe.1068", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Mon Dec 5 10:08:10 MST 2011", bibsource = "http://www.interscience.wiley.com/jpages/1532-0626; https://www.math.utah.edu/pub/tex/bib/ccpe.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Concurrency and Computation: Prac\-tice and Experience", journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626", onlinedate = "1 Aug 2006", } @Article{Gabor:2007:FES, author = "Ron Gabor and Shlomo Weiss and Avi Mendelson", title = "Fairness enforcement in switch on event multithreading", journal = j-TACO, volume = "4", number = "3", pages = "15:1--15:??", month = sep, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1275937.1275939", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 16 11:41:20 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The need to reduce power and complexity will increase the interest in Switch On Event multithreading (coarse-grained multithreading). Switch On Event multithreading is a low-power and low-complexity mechanism to improve processor throughput by switching threads on execution stalls. 
Fairness may, however, become a problem in a multithreaded processor. Unless fairness is properly handled, some threads may starve while others consume all of the processor cycles. Heuristics that were devised in order to improve fairness in simultaneous multithreading are not applicable to Switch On Event multithreading. This paper defines the fairness metric using the ratio of the individual threads' speedups and shows how it can be enforced in Switch On Event multithreading. Fairness is controlled by forcing additional thread switch points. These switch points are determined dynamically by runtime estimation of the single threaded performance of each of the individual threads. We analyze the impact of the fairness enforcement mechanism on aggregate IPC and weighted speedup. We present simulation results of the performance of Switch On Event multithreading. Switch On Event multithreading achieves an average aggregate IPC increase of 26\% over single thread and 12\% weighted speedup when no fairness is enforced. In this case, a sixth of our runs resulted in poor fairness in which one thread ran extremely slowly (10 to 100 times slower than its single-thread performance), while the other thread's performance was hardly affected. 
By using the proposed mechanism, we can guarantee fairness at different levels of strictness and, in most cases, even improve the weighted speedup.", acknowledgement = ack-nhfb, articleno = "15", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", keywords = "coarse-grained multithreading; fairness; multithreading; performance; SOE; Switch on Event multithreading; throughput; weighted speedup", } @Article{Ghoting:2007:CCF, author = "Amol Ghoting and Gregory Buehrer and Srinivasan Parthasarathy and Daehyun Kim and Anthony Nguyen and Yen-Kuang Chen and Pradeep Dubey", title = "Cache-conscious frequent pattern mining on modern and emerging processors", journal = j-VLDB-J, volume = "16", number = "1", pages = "77--96", month = jan, year = "2007", CODEN = "VLDBFR", ISSN = "1066-8888 (print), 0949-877X (electronic)", ISSN-L = "1066-8888", bibdate = "Mon Jun 23 10:51:22 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Algorithms are typically designed to exploit the current state of the art in processor technology. However, as processor technology evolves, said algorithms are often unable to derive the maximum achievable performance on these modern architectures. In this paper, we examine the performance of frequent pattern mining algorithms on a modern processor. A detailed performance study reveals that even the best frequent pattern mining implementations, with highly efficient memory managers, still grossly under-utilize a modern processor. The primary performance bottlenecks are {\em poor data locality\/} and {\em low instruction level parallelism (ILP)}. We propose a {\em cache-conscious prefix tree\/} to address this problem. The resulting tree improves spatial locality and also enhances the benefits from hardware cache line prefetching. 
Furthermore, the design of this data structure allows the use of {\em path tiling}, a novel tiling strategy, to improve temporal locality. The result is an overall speedup of up to 3.2 when compared with state of the art implementations. We then show how these algorithms can be improved further by realizing a non-naive thread-based decomposition that targets {\em simultaneously multi-threaded processors (SMT)}. A key aspect of this decomposition is to ensure cache re-use between threads that are co-scheduled at a fine granularity. This optimization affords an additional speedup of 50\%, resulting in an overall speedup of up to 4.8. The proposed optimizations also provide performance improvements on SMPs, and will most likely be beneficial on emerging processors.", acknowledgement = ack-nhfb, fjournal = "VLDB Journal: Very Large Data Bases", journal-URL = "http://portal.acm.org/toc.cfm?id=J869", keywords = "architecture-conscious algorithms; association rule mining; cache-conscious data mining; frequent itemset mining; frequent pattern mining", } @Article{Goldwasser:2007:INP, author = "Michael H. Goldwasser and David Letscher", title = "Introducing network programming into a {CS1} course", journal = j-SIGCSE, volume = "39", number = "3", pages = "19--22", month = sep, year = "2007", CODEN = "SIGSD3", DOI = "https://doi.org/10.1145/1269900.1268793", ISSN = "0097-8418 (print), 2331-3927 (electronic)", ISSN-L = "0097-8418", bibdate = "Sat Nov 17 16:57:36 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/python.bib; https://www.math.utah.edu/pub/tex/bib/sigcse2000.bib", note = "Proceedings of the 12th Annual SIGCSE Conference on Innovation and Technology in Computer Science Education (ITiCSE'07).", abstract = "Incorporating advanced programming concepts into an introductory programming course has to be done carefully to avoid overwhelming the students. 
We describe our experiences doing network programming in a CS1 course taught in Python. The simplicity of the built-in libraries allowed a fair amount of networking to be introduced in a week-long module of the course. In this short time we had the students writing both multithreaded clients and servers.", acknowledgement = ack-nhfb, fjournal = "SIGCSE Bulletin (ACM Special Interest Group on Computer Science Education)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J688", } @Article{Gotsman:2007:TMS, author = "Alexey Gotsman and Josh Berdine and Byron Cook and Mooly Sagiv", title = "Thread-modular shape analysis", journal = j-SIGPLAN, volume = "42", number = "6", pages = "266--277", month = jun, year = "2007", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1273442.1250765", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 10:55:30 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "We present the first shape analysis for multithreaded programs that avoids the explicit enumeration of execution-interleavings. Our approach is to automatically infer a resource invariant associated with each lock that describes the part of the heap protected by the lock. This allows us to use a sequential shape analysis on each thread. We show that resource invariants of a certain class can be characterized as least fixed points and computed via repeated applications of shape analysis only on each individual thread. Based on this approach, we have implemented a thread-modular shape analysis tool and applied it to concurrent heap-manipulating code from Windows device drivers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "abstract interpretation; concurrent programming; shape analysis; static analysis", } @Article{Gravvanis:2007:PPA, author = "George A. Gravvanis and Victor N. 
Epitropou and Konstantinos M. Giannoutakis", title = "On the performance of parallel approximate inverse preconditioning using {Java} multithreading techniques", journal = j-APPL-MATH-COMP, volume = "190", number = "1", pages = "255--270", day = "1", month = jul, year = "2007", CODEN = "AMHCBQ", ISSN = "0096-3003 (print), 1873-5649 (electronic)", ISSN-L = "0096-3003", bibdate = "Sat Jul 12 09:03:06 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sciencedirect.com/science/journal/00963003", acknowledgement = ack-nhfb, fjournal = "Applied Mathematics and Computation", journal-URL = "http://www.sciencedirect.com/science/journal/00963003", } @Article{Hur:2007:MSM, author = "Ibrahim Hur and Calvin Lin", title = "Memory scheduling for modern microprocessors", journal = j-TOCS, volume = "25", number = "4", pages = "10:1--10:??", month = dec, year = "2007", CODEN = "ACSYEC", DOI = "https://doi.org/10.1145/1314299.1314301", ISSN = "0734-2071 (print), 1557-7333 (electronic)", ISSN-L = "0734-2071", bibdate = "Mon Jun 16 17:52:15 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/tocs/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The need to carefully schedule memory operations has increased as memory performance has become increasingly important to overall system performance. 
This article describes the adaptive history-based (AHB) scheduler, which uses the history of recently scheduled operations to provide three conceptual benefits: (1) it allows the scheduler to better reason about the delays associated with its scheduling decisions, (2) it provides a mechanism for combining multiple constraints, which is important for increasingly complex DRAM structures, and (3) it allows the scheduler to select operations so that they match the program's mixture of Reads and Writes, thereby avoiding certain bottlenecks within the memory controller.\par We have previously evaluated this scheduler in the context of the IBM Power5. When compared with the state of the art, this scheduler improves performance by 15.6\%, 9.9\%, and 7.6\% for the Stream, NAS, and commercial benchmarks, respectively. This article expands our understanding of the AHB scheduler in a variety of ways. Looking backwards, we describe the scheduler in the context of prior work that focused exclusively on avoiding bank conflicts, and we show that the AHB scheduler is superior for the IBM Power5, which we argue will be representative of future microprocessor memory controllers. Looking forwards, we evaluate this scheduler in the context of future systems by varying a number of microarchitectural features and hardware parameters. 
For example, we show that the benefit of this scheduler increases as we move to multithreaded environments.", acknowledgement = ack-nhfb, articleno = "10", fjournal = "ACM Transactions on Computer Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J774", keywords = "adaptive history-based scheduling; memory scheduling; memory system performance", } @InBook{Kollias:2007:APC, author = "Giorgos Kollias and Efstratios Gallopoulos", title = "Asynchronous {PageRank} computation in an interactive multithreading environment", volume = "07071", publisher = "International Begegnungs- und Forschungszentrum f{\"u}r Informatik", address = "Wadern, Germany", pages = "????", year = "2007", ISBN = "????", ISBN-13 = "????", bibdate = "Fri Feb 19 15:32:30 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pagerank.bib", series = "Dagstuhl seminar proceedings", URL = "http://drops.dagstuhl.de/opus/volltexte/2007/1065/pdf/07071.KolliasGiorgios.Paper.1065", acknowledgement = ack-nhfb, } @Article{Kumar:2007:ESI, author = "Nagendra J. Kumar and Vasanth Asokan and Siddhartha Shivshankar and Alexander G. Dean", title = "Efficient software implementation of embedded communication protocol controllers using asynchronous software thread integration with time- and space-efficient procedure calls", journal = j-TECS, volume = "6", number = "1", pages = "2:1--2:??", month = feb, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1210268.1210270", ISSN = "1539-9087 (print), 1558-3465 (electronic)", ISSN-L = "1539-9087", bibdate = "Thu Jun 12 15:20:58 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The overhead of context switching limits efficient scheduling of multiple concurrent threads on a uniprocessor when real-time requirements exist. A software-implemented protocol controller may be crippled by this problem. 
The available idle time may be too short to recover through context switching, so only the primary thread can execute during message activity, slowing the secondary threads and potentially missing deadlines. Asynchronous software thread integration (ASTI) uses coroutine calls and integration, letting threads make independent progress efficiently, and reducing the needed context switches. We demonstrate the methods with a software implementation of an automotive communication protocol (J1850) and several secondary threads.", acknowledgement = ack-nhfb, articleno = "2", fjournal = "ACM Transactions on Embedded Computing Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J840", keywords = "asynchronous software thread integration; fine-grain concurrency; hardware to software migration; J1850; software-implemented communication protocol controllers", } @Article{Laudon:2007:CWM, author = "James Laudon and Lawrence Spracklen", title = "The Coming Wave of Multithreaded Chip Multiprocessors", journal = j-INT-J-PARALLEL-PROG, volume = "35", number = "3", pages = "299--330", month = jun, year = "2007", CODEN = "IJPPE5", DOI = "https://doi.org/10.1007/s10766-007-0033-6", ISSN = "0885-7458 (print), 1573-7640 (electronic)", ISSN-L = "0885-7458", bibdate = "Wed Jul 9 16:06:21 MDT 2008", bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=35&issue=3; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=35&issue=3&spage=299", acknowledgement = ack-nhfb, fjournal = "International Journal of Parallel Programming", journal-URL = "http://link.springer.com/journal/10766", keywords = "Chip multiprocessing; multithreading; parallel programming; performance", } @Article{Le:2007:IPM, author = "H. Q. Le and W. J. Starke and J. S. Fields and F. P. O'Connell and D. Q. Nguyen and B. J. Ronchetti and W. M. Sauer and E. M. Schwarz and M. T. 
Vaden", title = "{IBM POWER6} microarchitecture", journal = j-IBM-JRD, volume = "51", number = "6", pages = "639--??", month = nov, year = "2007", CODEN = "IBMJAE", ISSN = "0018-8646 (print), 2151-8556 (electronic)", ISSN-L = "0018-8646", bibdate = "Mon Jul 7 21:49:07 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.research.ibm.com/journal/", URL = "http://www.research.ibm.com/journal/rd/516/le.html", abstract = "This paper describes the implementation of the IBM POWER6 microprocessor, a two-way simultaneous multithreaded (SMT) dual-core chip whose key features include binary compatibility with IBM POWER5 microprocessor-based systems; increased functional capabilities, such as decimal floating-point and vector multimedia extensions; significant reliability, availability, and serviceability enhancements; and robust scalability with up to 64 physical processors. Based on a new industry-leading high-frequency core architecture with enhanced SMT and driven by a high-throughput symmetric multiprocessing (SMP) cache and memory subsystem, the POWER6 chip achieves a significant performance boost compared with its predecessor, the POWER5 chip. Key extensions to the coherence protocol enable POWER6 microprocessor-based systems to achieve better SMP scalability while enabling reductions in system packaging complexity and cost.", acknowledgement = ack-nhfb, fjournal = "IBM Journal of Research and Development", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520", } @Article{Leadbitter:2007:NM, author = "P. Leadbitter and D. Page and N. P. 
Smart", title = "Nondeterministic Multithreading", journal = j-IEEE-TRANS-COMPUT, volume = "56", number = "7", pages = "992--998", month = jul, year = "2007", CODEN = "ITCOB4", DOI = "https://doi.org/10.1109/TC.2007.1049", ISSN = "0018-9340 (print), 1557-9956 (electronic)", ISSN-L = "0018-9340", bibdate = "Mon Jul 4 15:03:40 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2000.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=4216296", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Computers", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12", } @Article{Li:2007:CET, author = "Peng Li and Steve Zdancewic", title = "Combining events and threads for scalable network services implementation and evaluation of monadic, application-level concurrency primitives", journal = j-SIGPLAN, volume = "42", number = "6", pages = "189--199", month = jun, year = "2007", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1273442.1250756", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 10:55:30 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "This paper proposes to combine two seemingly opposed programming models for building massively concurrent network services: the event-driven model and the multithreaded model. The result is a hybrid design that offers the best of both worlds--the ease of use and expressiveness of threads and the flexibility and performance of events.\par This paper shows how the hybrid model can be implemented entirely at the application level using concurrency monads in Haskell, which provides type-safe abstractions for both events and threads. This approach simplifies the development of massively concurrent software in a way that scales to real-world network services. 
The Haskell implementation supports exceptions, symmetrical multiprocessing, software transactional memory, asynchronous I/O mechanisms and application-level network protocol stacks. Experimental results demonstrate that this monad-based approach has good performance: the threads are extremely lightweight (scaling to ten million threads), and the I/O performance compares favorably to that of Linux NPTL. tens of thousands of simultaneous, mostly-idle client connections. Such massively-concurrent programs are difficult to implement, especially when other requirements, such as high performance and strong security, must also be met.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "concurrency; event; Haskell; implementation; monad; networking; programming; scalability; thread", } @Article{Madan:2007:PEA, author = "Niti Madan and Rajeev Balasubramonian", title = "Power Efficient Approaches to Redundant Multithreading", journal = j-IEEE-TRANS-PAR-DIST-SYS, volume = "18", number = "8", pages = "1066--1079", month = aug, year = "2007", CODEN = "ITDSEO", DOI = "https://doi.org/10.1109/TPDS.2007.1090", ISSN = "1045-9219 (print), 1558-2183 (electronic)", ISSN-L = "1045-9219", bibdate = "Thu Jul 3 14:26:53 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Parallel and Distributed Systems", journal-URL = "http://www.computer.org/tpds/archives.htm", } @Article{Mahesri:2007:HSS, author = "Aqeel Mahesri and Nicholas J. Wang and Sanjay J. 
Patel", title = "Hardware support for software controlled multithreading", journal = j-COMP-ARCH-NEWS, volume = "35", number = "1", pages = "3--12", month = mar, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1241601.1241606", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:47:26 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Chip multi-processors have emerged as one of the most effective uses of the huge number of transistors available today and in the future, but questions remain as to the best way to leverage CMPs to accelerate single threaded applications. Previous approaches rely on significant speculation to accomplish this goal. Our proposal, NXA, is less speculative than previous proposals, relying heavily on software to guarantee thread correctness, though still allowing parallelism in the presence of ambiguous dependences. It divides a single thread of execution into multiple using the master-worker paradigm where some set of master threads execute code that spawns tasks for other, worker threads. The master threads generally consist of performance critical instructions that can prefetch data, compute critical control decisions, or compute performance critical dataflow slices. This prevents non-critical instructions from competing with critical instructions for processor resources, allowing the critical thread (and thus the workload) to complete faster. 
Empirical results from performance simulation show a 20\% improvement in performance on a 2-way CMP machine, demonstrating that software controlled multithreading can indeed provide a benefit in the presence of hardware support.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", remark = "{DASCMP'06}", } @Article{Marowka:2007:PCD, author = "Ami Marowka", title = "Parallel computing on any desktop", journal = j-CACM, volume = "50", number = "9", pages = "74--78", month = sep, year = "2007", CODEN = "CACMA2", DOI = "https://doi.org/10.1145/1284621.1284622", ISSN = "0001-0782 (print), 1557-7317 (electronic)", ISSN-L = "0001-0782", bibdate = "Mon Jun 16 18:32:57 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Parallelization lets applications exploit the high throughput of new multicore processors, and the OpenMP parallel programming model helps developers create multithreaded applications.", acknowledgement = ack-nhfb, fjournal = "Communications of the ACM", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J79", } @Article{Minh:2007:EHT, author = "Chi Cao Minh and Martin Trautmann and JaeWoong Chung and Austen McDonald and Nathan Bronson and Jared Casper and Christos Kozyrakis and Kunle Olukotun", title = "An effective hybrid transactional memory system with strong isolation guarantees", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "69--80", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1250662.1250673", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "We propose signature-accelerated transactional memory (SigTM), a hybrid TM system that reduces the overhead of software transactions. 
SigTM uses hardware signatures to track the read-set and write-set for pending transactions and perform conflict detection between concurrent threads. All other transactional functionality, including data versioning, is implemented in software. Unlike previously proposed hybrid TM systems, SigTM requires no modifications to the hardware caches, which reduces hardware cost and simplifies support for nested transactions and multithreaded processor cores. SigTM is also the first hybrid TM system to provide strong isolation guarantees between transactional blocks and non-transactional accesses without additional read and write barriers in non-transactional code.\par Using a set of parallel programs that make frequent use of coarse-grain transactions, we show that SigTM accelerates software transactions by 30\% to 280\%. For certain workloads, SigTM can match the performance of a full-featured hardware TM system, while for workloads with large read-sets it can be up to two times slower. Overall, we show that SigTM combines the performance characteristics and strong isolation guarantees of hardware TM implementations with the low cost and flexibility of software TM systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", keywords = "multi-core architectures; parallel programming; strong isolation; transactional memory", } @Article{Morandini:2007:UDS, author = "Marco Morandini and Paolo Mantegazza", title = "Using dense storage to solve small sparse linear systems", journal = j-TOMS, volume = "33", number = "1", pages = "5:1--5:12", month = mar, year = "2007", CODEN = "ACMSCU", DOI = "https://doi.org/10.1145/1206040.1206045", ISSN = "0098-3500 (print), 1557-7295 (electronic)", ISSN-L = "0098-3500", bibdate = "Sat Apr 14 09:48:58 MDT 2007", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/toms.bib", abstract = "A data 
structure is used to build a linear solver specialized for relatively small sparse systems. The proposed solver, optimized for run-time performance at the expense of memory footprint, outperforms widely used direct and sparse solvers for systems with between 100 and 3000 equations. A multithreaded version of the solver is shown to give some speedups for problems with medium fill-in, while it does not give any benefit for very sparse problems.", acknowledgement = ack-nhfb, articleno = "5", fjournal = "ACM Transactions on Mathematical Software (TOMS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J782", } @Article{Musuvathi:2007:ICB, author = "Madanlal Musuvathi and Shaz Qadeer", title = "Iterative context bounding for systematic testing of multithreaded programs", journal = j-SIGPLAN, volume = "42", number = "6", pages = "446--455", month = jun, year = "2007", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1273442.1250785", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 10:55:30 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Multithreaded programs are difficult to get right because of unexpected interaction between concurrently executing threads. Traditional testing methods are inadequate for catching subtle concurrency errors which manifest themselves late in the development cycle and post-deployment. Model checking or systematic exploration of program behavior is a promising alternative to traditional testing methods. However, it is difficult to perform systematic search on large programs as the number of possible program behaviors grows exponentially with the program size. Confronted with this state-explosion problem, traditional model checkers perform iterative depth-bounded search. 
Although effective for message-passing software, iterative depth-bounding is inadequate for multithreaded software.\par This paper proposes iterative context-bounding, a new search algorithm that systematically explores the executions of a multithreaded program in an order that prioritizes executions with fewer context switches. We distinguish between preempting and nonpreempting context switches, and show that bounding the number of preempting context switches to a small number significantly alleviates the state explosion, without limiting the depth of explored executions. We show both theoretically and empirically that context-bounded search is an effective method for exploring the behaviors of multithreaded programs. We have implemented our algorithm in two model checkers and applied it to a number of real-world multithreaded programs. Our implementation uncovered 9 previously unknown bugs in our benchmarks, each of which was exposed by an execution with at most 2 preempting context switches. 
Our initial experience with the technique is encouraging and demonstrates that iterative context-bounding is a significant improvement over existing techniques for testing multithreaded programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "concurrency; context-bounding; model checking; multithreading; partial-order reduction; shared-memory programs; software testing", } @Article{Naik:2007:CMA, author = "Mayur Naik and Alex Aiken", title = "Conditional must not aliasing for static race detection", journal = j-SIGPLAN, volume = "42", number = "1", pages = "327--338", month = jan, year = "2007", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1190216.1190265", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 10:53:14 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Race detection algorithms for multi-threaded programs using the common lock-based synchronization idiom must correlate locks with the memory locations they guard. The heart of a proof of race freedom is showing that if two locks are distinct, then the memory locations they guard are also distinct. This is an example of a general property we call conditional must not aliasing: Under the assumption that two objects are not aliased, prove that two other objects are not aliased. 
This paper introduces and gives an algorithm for conditional must not alias analysis and discusses experimental results for sound race detection of Java programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "concurrency; Java; multi-threading; static race detection; synchronization", } @Article{Narayanasamy:2007:ACB, author = "Satish Narayanasamy and Zhenghao Wang and Jordan Tigani and Andrew Edwards and Brad Calder", title = "Automatically classifying benign and harmful data races using replay analysis", journal = j-SIGPLAN, volume = "42", number = "6", pages = "22--31", month = jun, year = "2007", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1250734.1250738", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 10:55:30 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Many concurrency bugs in multi-threaded programs are due to data races. There have been many efforts to develop static and dynamic mechanisms to automatically find the data races. Most of the prior work has focused on finding the data races and eliminating the false positives.\par In this paper, we instead focus on a dynamic analysis technique to automatically classify the data races into two categories --- the data races that are potentially benign and the data races that are potentially harmful. A harmful data race is a real bug that needs to be fixed. This classification is needed to focus the triaging effort on those data races that are potentially harmful. Without prioritizing the data races we have found that there are too many data races to triage. 
Our second focus is to automatically provide to the developer a reproducible scenario of the data race, which allows the developer to understand the different effects of a harmful data race on a program's execution.\par To achieve the above, we record a multi-threaded program's execution in a replay log. The replay log is used to replay the multi-threaded program, and during replay we find the data races using a happens-before based algorithm. To automatically classify if a data race that we find is potentially benign or potentially harmful, we replay the execution twice for a given data race --- one for each possible order between the conflicting memory operations. If the two replays for the two orders produce the same result, then we classify the data race to be potentially benign. We discuss our experiences in using our replay based dynamic data race checker on several Microsoft applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "benign data races; concurrency bugs; replay", } @Article{Ostler:2007:IHT, author = "Chris Ostler and Karam S. Chatha and Vijay Ramamurthi and Krishnan Srinivasan", title = "{ILP} and heuristic techniques for system-level design on network processor architectures", journal = j-TODAES, volume = "12", number = "4", pages = "48:1--48:??", month = sep, year = "2007", CODEN = "ATASFO", DOI = "https://doi.org/10.1145/1278349.1278361", ISSN = "1084-4309 (print), 1557-7309 (electronic)", ISSN-L = "1084-4309", bibdate = "Thu Jun 12 18:09:35 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/todaes/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Network processors incorporate several architectural features, including symmetric multiprocessing (SMP), block multithreading, and multiple memory elements, to support the high-performance requirements of current day applications. 
This article presents automated system-level design techniques for application development on such architectures. We propose integer linear programming formulations and heuristic techniques for process allocation and data mapping on SMP and block-multithreading-based network processors. The techniques incorporate process transformations and multithreading-aware data mapping to maximize the throughput of the application. The article presents experimental results that evaluate the techniques by implementing network processing applications on the Intel IXP 2400 architecture.", acknowledgement = ack-nhfb, articleno = "48", fjournal = "ACM Transactions on Design Automation of Electronic Systems (TODAES)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J776", keywords = "block multithreading; multiprocessor", } @Article{Park:2007:MEP, author = "Soyeon Park and Weihang Jiang and Yuanyuan Zhou and Sarita Adve", title = "Managing energy-performance tradeoffs for multithreaded applications on multiprocessor architectures", journal = j-SIGMETRICS, volume = "35", number = "1", pages = "169--180", month = jun, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1254882.1254902", ISSN = "0163-5999 (print), 1557-9484 (electronic)", ISSN-L = "0163-5999", bibdate = "Fri Jun 27 09:42:48 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "In modern computers, non-performance metrics such as energy consumption have become increasingly important, requiring tradeoff with performance. A recent work has proposed performance-guaranteed energy management, but it is designed specifically for sequential applications and cannot be used to a large class of multithreaded applications running on high end computers and data servers.\par To address the above problem, this paper makes the first attempt to provide performance-guaranteed energy management for multithreaded applications on multiprocessor architectures. 
We first conduct a comprehensive study on the effects of energy adaptation on thread synchronizations and show that a multithreaded application suffers from not only local slowdowns due to energy adaptation, but also significant slowdowns propagated from other threads because of synchronization. Based on these findings, we design three Synchronization-Aware (SA) algorithms, LWT (Lock Waiting Time-based), CSL (Critical Section Length-based) and ODP (Operation Delay Propagation-based) algorithms, to estimate the energy adaptation-induced slowdowns on each thread. The local slowdowns are then combined across multiple threads via three aggregation methods (MAX, AVG and SUM) to estimate the overall application slowdown.\par We evaluate our methods using a large multithreaded commercial application, IBM DB2 with industrial-strength online transaction processing (OLTP) workloads, and six SPLASH parallel scientific applications. Our experimental results show that LWT combined with the MAX aggregation method not only controls the performance slow down within the specified limits but also conserves the most energy.", acknowledgement = ack-nhfb, fjournal = "ACM SIGMETRICS Performance Evaluation Review", journal-URL = "http://portal.acm.org/toc.cfm?id=J618", keywords = "energy and performance tradeoffs; low power design; memory energy management; multithreaded applications", } @Article{Permandla:2007:TSP, author = "Pratibha Permandla and Michael Roberson and Chandrasekhar Boyapati", title = "A type system for preventing data races and deadlocks in the {Java Virtual Machine} language: 1", journal = j-SIGPLAN, volume = "42", number = "7", pages = "10--10", month = jul, year = "2007", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1254766.1254768", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 10:57:50 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "In previous 
work on SafeJava we presented a type system extension to the Java source language that statically prevents data races and deadlocks in multithreaded programs. SafeJava is expressive enough to support common programming patterns, its type checking is fast and scalable, and it requires little programming overhead. SafeJava thus offers a promising approach for making multithreaded programs more reliable. This paper presents a corresponding type system extension for the Java virtual machine language (JVML). We call the resulting language SafeJVML. Well-typed SafeJVML programs are guaranteed to be free of data races and deadlocks. Designing a corresponding type system for JVML is important because most Java code is shipped in the JVML format. Designing a corresponding type system for JVML is nontrivial because of important differences between Java and JVML. In particular, the absence of block structure in JVML programs and the fact that they do not use named local variables the way Java programs do make the type systems for Java and JVML significantly different. For example, verifying absence of races and deadlocks in JVML programs requires performing an alias analysis, something that was not necessary for verifying absence of races and deadlocks in Java programs. This paper presents static and dynamic semantics for SafeJVML. It also includes a proof that the SafeJVML type system is sound and that it prevents data races and deadlocks. 
To the best of our knowledge, this is the first type system for JVML that statically ensures absence of synchronization errors.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "data races; deadlocks; ownership types; SafeJava", } @Article{Pozniansky:2007:MEF, author = "Eli Pozniansky and Assaf Schuster", title = "{MultiRace}: efficient on-the-fly data race detection in multithreaded {C++} programs", journal = j-CCPE, volume = "19", number = "3", pages = "327--340", day = "10", month = mar, year = "2007", CODEN = "CCPEBO", DOI = "https://doi.org/10.1002/cpe.1064", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Mon Dec 5 10:08:10 MST 2011", bibsource = "http://www.interscience.wiley.com/jpages/1532-0626; https://www.math.utah.edu/pub/tex/bib/ccpe.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Concurrency and Computation: Prac\-tice and Experience", journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626", onlinedate = "1 Aug 2006", } @Article{Rosu:2007:ITO, author = "Grigore Ro{\c{s}}u and Koushik Sen", title = "An instrumentation technique for online analysis of multithreaded programs", journal = j-CCPE, volume = "19", number = "3", pages = "311--325", day = "10", month = mar, year = "2007", CODEN = "CCPEBO", DOI = "https://doi.org/10.1002/cpe.1066", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Mon Dec 5 10:08:10 MST 2011", bibsource = "http://www.interscience.wiley.com/jpages/1532-0626; https://www.math.utah.edu/pub/tex/bib/ccpe.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Concurrency and Computation: Prac\-tice and Experience", journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626", onlinedate = "1 Aug 2006", } @Article{Sharkey:2007:EOA, author = "Joseph J. 
Sharkey and Dmitry V. Ponomarev", title = "Exploiting Operand Availability for Efficient Simultaneous Multithreading", journal = j-IEEE-TRANS-COMPUT, volume = "56", number = "2", pages = "208--223", month = feb, year = "2007", CODEN = "ITCOB4", DOI = "https://doi.org/10.1109/TC.2007.28", ISSN = "0018-9340 (print), 1557-9956 (electronic)", ISSN-L = "0018-9340", bibdate = "Mon Jul 4 15:03:37 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2000.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=4042681", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Computers", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12", } @Article{Shi:2007:CCP, author = "Xudong Shi and Feiqi Su and Jih-kwon Peir and Ye Xia and Zhen Yang", title = "{CMP} cache performance projection: accessibility vs. capacity", journal = j-COMP-ARCH-NEWS, volume = "35", number = "1", pages = "13--20", month = mar, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1241601.1241607", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:47:26 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Efficient utilizing on-chip storage space on Chip-Multiprocessors (CMPs) has become an important research topic. Tradeoffs between data accessibility and effective on-chip capacity have been studied extensively. It requires costly simulations to understand a wide-spectrum of the design space. In this paper, we first develop an abstract model for understanding the performance impact with respect to data replication. To overcome the lack of real-time interactions among multiple cores in the abstract model, we propose a global stack simulation strategy to study the performance of a variety of cache organizations on CMPs. 
The global stack logically incorporates a shared stack and per-core private stacks to collect shared/private reuse (stack) distances for every memory reference in a single simulation pass. With the collected reuse distances, performance in terms of hits/misses and average memory access times can be calculated for various cache organizations. We verify the stack results against individual execution-driven simulations that consider realistic cache parameters and delays using a set of commercial multithreaded workloads. The results show that stack simulations can accurately model the performance of various cache organizations. The single-pass stack simulation results demonstrate that the effectiveness of various techniques for optimizing the CMP on-chip storage is closely related to the working sets of the workloads as well as to the total cache sizes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", keywords = "CMP caches; data replication; performance modeling and projection; stack simulation", remark = "{DASCMP'06}", } @Article{Smaragdakis:2007:TIC, author = "Yannis Smaragdakis and Anthony Kay and Reimer Behrends and Michal Young", title = "Transactions with isolation and cooperation", journal = j-SIGPLAN, volume = "42", number = "10", pages = "191--210", month = oct, year = "2007", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1297027.1297042", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 11:00:28 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "We present the TIC (Transactions with Isolation and Cooperation) model for concurrent programming. TIC adds to standard transactional memory the ability for a transaction to observe the effects of other threads at selected points. 
This allows transactions to cooperate, as well as to invoke nonrepeatable or irreversible operations, such as I/O. Cooperating transactions run the danger of exposing intermediate state and of having other threads change the transaction's state. The TIC model protects against unanticipated interference by having the type system keep track of all operations that may (transitively) violate the atomicity of a transaction and require the programmer to establish consistency at appropriate points. The result is a programming model that is both general and simple. We have used the TIC model to re-engineer existing lock-based applications including a substantial multi-threaded web mail server and a memory allocator with coarse-grained locking. Our experience confirms the features of the TIC model: It is convenient for the programmer, while maintaining the benefits of transactional memory.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "nested transactions; open-nesting; punctuation; TIC; transactional memory", } @Book{Sweetman:2007:SMR, author = "Dominic Sweetman", title = "See {MIPS} Run", publisher = pub-MORGAN-KAUFMANN, address = pub-MORGAN-KAUFMANN:adr, edition = "Second", pages = "xix + 492", year = "2007", ISBN = "0-12-088421-6", ISBN-13 = "978-0-12-088421-6", LCCN = "QA76.9.A73 S88 2007", bibdate = "Thu Jun 20 10:21:55 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/linux.bib; https://www.math.utah.edu/pub/tex/bib/master.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/unix.bib", acknowledgement = ack-nhfb, keywords = "Embedded computer systems --- Programming; MIPS (Computer architecture); RISC microprocessors", libnote = "Not yet in my library.", tableofcontents = "1: RISCs and MIPS architectures / 1 \\ 2: MIPS architecture / 29 \\ 3: Coprocessor 0: MIPS processor control / 53 \\ 4: How caches work on MIPS 
processors / 79 \\ 5: Exceptions, interrupts, and initialization / 105 \\ 6: Low-level memory management and the TLB / 131 \\ 7: Floating-point support / 151 \\ 8: Complete guide to the MIPS instruction set / 183 \\ 9: Reading MIPS assembly language / 263 \\ 10: Porting software to the MIPS architecture / 279 \\ 11: MIPS software standards (ABIs) / 311 \\ 12: Debugging MIPS designs - debug and profiling features / 339 \\ 13: GNU/Linux from eight miles high / 363 \\ 14: How hardware and software work together / 371 \\ 15: MIPS specific issues in the Linux kernel / 399 \\ 16: Linux application code, PIC, and libraries / 409 \\ Appendix A: MIPS multithreading / 415 \\ Appendix B: Other optional extensions to the MIPS instruction set", } @Article{Tam:2007:TCS, author = "David Tam and Reza Azimi and Michael Stumm", title = "Thread clustering: sharing-aware scheduling on {SMP--CMP--SMT} multiprocessors", journal = j-OPER-SYS-REV, volume = "41", number = "3", pages = "47--58", month = jun, year = "2007", CODEN = "OSRED8", DOI = "https://doi.org/10.1145/1272996.1273004", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Fri Jun 20 17:16:31 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The major chip manufacturers have all introduced chip multiprocessing (CMP) and simultaneous multithreading (SMT) technology into their processing units. As a result, even low-end computing systems and game consoles have become shared memory multiprocessors with L1 and L2 cache sharing within a chip. Mid- and large-scale systems will have multiple processing chips and hence consist of an SMP-CMP-SMT configuration with non-uniform data sharing overheads. 
Current operating system schedulers are not aware of these new cache organizations, and as a result, distribute threads across processors in a way that causes many unnecessary, long-latency cross-chip cache accesses.\par In this paper we describe the design and implementation of a scheme to schedule threads based on sharing patterns detected online using features of standard performance monitoring units (PMUs) available in today's processing units. The primary advantage of using the PMU infrastructure is that it is fine-grained (down to the cache line) and has relatively low overhead. We have implemented our scheme in Linux running on an 8- way Power5 SMP-CMP-SMT multi-processor. For commercial multithreaded server workloads (VolanoMark, SPECjbb, and RUBiS), we are able to demonstrate reductions in cross-chip cache accesses of up to 70\%. These reductions lead to application-reported performance improvements of up to 7\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGOPS Operating Systems Review", keywords = "affinity scheduling; cache behavior; cache locality; CMP; detecting sharing; hardware performance counters; hardware performance monitors; multithreading; performance monitoring unit; resource allocation; shared caches; sharing; simultaneous multithreading; single-chip multiprocessors; SMP; SMT; thread migration; thread placement; thread scheduling", } @Article{Walcott:2007:DPA, author = "Kristen R. 
Walcott and Greg Humphreys and Sudhanva Gurumurthi", title = "Dynamic prediction of architectural vulnerability from microarchitectural state", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "516--527", month = may, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1250662.1250726", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Transient faults due to particle strikes are a key challenge in microprocessor design. Driven by exponentially increasing transistor counts, per-chip faults are a growing burden. To protect against soft errors, redundancy techniques such as redundant multithreading (RMT) are often used. However, these techniques assume that the probability that a structural fault will result in a soft error (i.e., the Architectural Vulnerability Factor (AVF)) is 100 percent, unnecessarily draining processor resources. Due to the high cost of redundancy, there have been efforts to throttle RMT at runtime. To date, these methods have not incorporated an AVF model and therefore tend to be ad hoc. Unfortunately, computing the AVF of complex microprocessor structures (e.g., the ISQ) can be quite involved.\par To provide probabilistic guarantees about fault tolerance, we have created a rigorous characterization of AVF behavior that can be easily implemented in hardware. We experimentally demonstrate AVF variability within and across the SPEC2000 benchmarks and identify strong correlations between structural AVF values and a small set of processor metrics. 
Using these simple indicators as predictors, we create a proof-of-concept RMT implementation that demonstrates that AVF prediction can be used to maintain a low fault tolerance level without significant performance impact.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", keywords = "architecture vulnerability factor; microarchitecture; performance; redundant multithreading; reliability", } @Article{Wang:2007:EAP, author = "Perry H. Wang and Jamison D. Collins and Gautham N. Chinya and Hong Jiang and Xinmin Tian and Milind Girkar and Nick Y. Yang and Guei-Yuan Lueh and Hong Wang", title = "{EXOCHI}: architecture and programming environment for a heterogeneous multi-core multithreaded system", journal = j-SIGPLAN, volume = "42", number = "6", pages = "156--166", month = jun, year = "2007", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1250734.1250753", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 10:55:30 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Future mainstream microprocessors will likely integrate specialized accelerators, such as GPUs, onto a single die to achieve better performance and power efficiency. However, it remains a keen challenge to program such a heterogeneous multicore platform, since these specialized accelerators feature ISAs and functionality that are significantly different from the general purpose CPU cores. 
In this paper, we present EXOCHI: (1) Exoskeleton Sequencer (EXO), an architecture to represent heterogeneous accelerators as ISA-based MIMD architecture resources, and a shared virtual memory heterogeneous multithreaded program execution model that tightly couples specialized accelerator cores with general-purpose CPU cores, and (2) C for Heterogeneous Integration (CHI), an integrated C/C++ programming environment that supports accelerator-specific inline assembly and domain-specific languages. The CHI compiler extends the OpenMP pragma for heterogeneous multithreading programming, and produces a single fat binary with code sections corresponding to different instruction sets. The runtime can judiciously spread parallel computation across the heterogeneous cores to optimize performance and power.\par We have prototyped the EXO architecture on a physical heterogeneous platform consisting of an Intel{\reg} Core{\TM} 2 Duo processor and an 8-core 32-thread Intel{\reg} Graphics Media Accelerator X3000. In addition, we have implemented the CHI integrated programming environment with the Intel{\reg} C++ Compiler, runtime toolset, and debugger. 
On the EXO prototype system, we have enhanced a suite of production-quality media kernels for video and image processing to utilize the accelerator through the CHI programming interface, achieving significant speedup (1.41X to 10.97X) over execution on the IA32 CPU alone.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "GPU; heterogeneous multi-cores; openMP", } @Article{Wang:2007:OSC, author = "Qin Wang and Junpu Chen and Weihua Zhang and Min Yang and Binyu Zang", title = "Optimizing software cache performance of packet processing applications", journal = j-SIGPLAN, volume = "42", number = "7", pages = "227--236", month = jul, year = "2007", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1273444.1254808", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 10:57:50 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Network processors (NPs) are widely used in many types of networking equipment due to their high performance and flexibility. For most NPs, software cache is used instead of hardware cache due to the chip area, cost and power constraints. Therefore, programmers should take full responsibility for software cache management which is neither intuitive nor easy to most of them. Actually, without an effective use of it, long memory access latency will be a critical limiting factor to overall applications. Prior researches like hardware multi-threading, wide-word accesses and packet access combination for caching have already been applied to help programmers to overcome this bottleneck. However, most of them do not make enough use of the characteristics of packet processing applications and often perform intraprocedural optimizations only. 
As a result, the binary codes generated by those techniques often get lower performance than that comes from hand-tuned assembly programming for some applications. In this paper, we propose an algorithm including two techniques --- Critical Path Based Analysis (CPBA) and Global Adaptive Localization (GAL), to optimize the software cache performance of packet processing applications. Packet processing applications usually have several hot paths and CPBA tries to insert localization instructions according to their execution frequencies. For further optimizations, GAL eliminates some redundant localization instructions by interprocedural analysis and optimizations. Our algorithm is applied on some representative applications. Experiment results show that it leads to an average speedup by a factor of 1.974.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "local memory; network processor; optimization", } @Article{Yan:2007:HMC, author = "Jun Yan and Wei Zhang", title = "Hybrid multi-core architecture for boosting single-threaded performance", journal = j-COMP-ARCH-NEWS, volume = "35", number = "1", pages = "141--148", month = mar, year = "2007", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1241601.1241603", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:47:26 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The scaling of technology and the diminishing return of complicated uniprocessors have driven the industry towards multicore processors. While multithreaded applications can naturally leverage the enhanced throughput of multi-core processors, a large number of important applications are single-threaded, which cannot automatically harness the potential of multi-core processors. 
In this paper, we propose a compiler-driven heterogeneous multicore architecture, consisting of tightly-integrated VLIW (Very Long Instruction Word) and superscalar processors on a single chip, to automatically boost the performance of single-threaded applications without compromising the capability to support multithreaded programs. In the proposed multi-core architecture, while the high-performance VLIW core is used to run code segments with high instruction-level parallelism (ILP) extracted by the compiler; the superscalar core can be exploited to deal with the runtime events that are typically difficult for the VLIW core to handle, such as L2 cache misses. Our initial experimental results by running the preexecution thread on the superscalar core to mitigate the L2 cache misses of the main thread on the VLIW core indicate that the proposed VLIW/superscalar multi-core processor can automatically improve the performance of single-threaded general-purpose applications by up to 40.8\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Yang:2007:RUL, author = "Jin-Min Yang and Da-Fang Zhang and Xue-Dong Yang and Wen-Wei Li", title = "Reliable user-level rollback recovery implementation for multithreaded processes on {Windows}", journal = j-SPE, volume = "37", number = "3", pages = "331--346", month = mar, year = "2007", CODEN = "SPEXBL", DOI = "https://doi.org/10.1002/spe.771", ISSN = "0038-0644 (print), 1097-024X (electronic)", ISSN-L = "0038-0644", bibdate = "Wed Oct 17 18:33:14 MDT 2007", bibsource = "http://www.interscience.wiley.com/jpages/0038-0644; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www3.interscience.wiley.com/journalfinder.html", acknowledgement = ack-nhfb, fjournal = "Software---Practice and Experience", journal-URL = "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1097-024X", onlinedate = "24 Oct 2006", } 
@Article{Zebchuk:2007:BBC, author = "J. Zebchuk and A. Moshovos", title = "A Building Block for Coarse-Grain Optimizations in the On-Chip Memory Hierarchy", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "6", number = "2", pages = "33--36", month = feb, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2007.9", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Current on-chip block-centric memory hierarchies exploit access patterns at the fine-grain scale of small blocks. Several recently proposed memory hierarchy enhancements for coherence traffic reduction and prefetching suggest that additional useful patterns emerge with a macroscopic, coarse-grain view. This paper presents RegionTracker, a dual-grain, on-chip cache design that exposes coarse-grain behavior while maintaining block-level communication. RegionTracker eliminates the extraneous, often imprecise coarse-grain tracking structures of previous proposals. It can be used as the building block for coarse-grain optimizations, reducing their overall cost and easing their adoption. Using full-system simulation of a quad-core chip multiprocessor and commercial workloads, we demonstrate that RegionTracker overcomes the inefficiencies of previous coarse-grain cache designs. 
We also demonstrate how RegionTracker boosts the benefits and reduces the cost of a previously proposed snoop reduction technique.", acknowledgement = ack-nhfb, fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "access patterns; Bandwidth; cache storage; Cache storage; coarse-grain optimizations; coherence traffic reduction; Cost function; Design optimization; Explosions; Information management; Memory management; Multithreading; on-chip memory hierarchy; optimising compilers; Prefetching; prefetching; Proposals; quad-core chip multiprocessor; RegionTracker dual-grain on-chip cache design; system-on-chip", } @Article{Abdulla:2008:MCR, author = "Parosh Aziz Abdulla and Fr{\'e}d{\'e}ric Haziza and Mats Kindahl", title = "Model checking race-freeness", journal = j-COMP-ARCH-NEWS, volume = "36", number = "5", pages = "72--79", month = dec, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1556444.1556454", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Jun 26 11:50:56 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "With the introduction of highly concurrent systems in standard desktop computers, ensuring correctness of industrial-size concurrent programs is becoming increasingly important. One of the most important standards in use for developing multi-threaded programs is the POSIX Threads standard, commonly known as PThreads. Of particular importance, the analysis of industrial code should, as far as possible, be automatic and not require annotations or other forms of specifications of the code.\par Model checking has been one of the most successful approaches to program verification during the last two decades. The size and complexity of applications which can be handled have increased rapidly through integration with symbolic techniques. 
These methods are designed to work on finite (but large) state spaces. This framework fails to deal with several essential aspects of behaviours for multithreaded programs: there is no bound a priori on the number of threads which may arise in a given run of the system; each thread manipulates local variables which often range over unbounded domains; and the system has a dynamic structure in the sense that threads can be created and killed throughout execution of the system. In this paper we concentrate on checking a particular class of properties for concurrent programs, namely safety properties. In particular, we focus on race-freeness, that is, the absence of race conditions (also known as data races) in shared-variable pthreaded programs.\par We will follow a particular methodology which we have earlier developed for model checking general classes of infinite-state systems [1, 3, 6, 8, 9] and apply a symbolic backward reachability analysis to verify the safety property. Since we construct a model as an over-approximation of the original program, proving the safety property in the model implies that the property also holds in the original system. Surprisingly, it leads to a quite efficient analysis which can be carried out fully automatically.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Abraham:2008:DPS, author = "Erika {\'A}brah{\'a}m and Frank S. 
de Boer and Willem-Paul de Roever and Martin Steffen", title = "A Deductive Proof System for Multithreaded {Java} with Exceptions", journal = j-FUND-INFO, volume = "82", number = "4", pages = "391--463", month = jul, year = "2008", CODEN = "FUMAAJ", ISSN = "0169-2968 (print), 1875-8681 (electronic)", ISSN-L = "0169-2968", bibdate = "Sat Mar 5 17:06:39 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/fundinfo2000.bib; https://www.math.utah.edu/pub/tex/bib/java2000.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Fundamenta Informaticae", journal-URL = "http://content.iospress.com/journals/fundamenta-informaticae", } @Article{Adams:2008:ENE, author = "Michael D. Adams and R. Kent Dybvig", title = "Efficient nondestructive equality checking for trees and graphs", journal = j-SIGPLAN, volume = "43", number = "9", pages = "179--188", month = sep, year = "2008", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1411203.1411230", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Sep 23 17:31:25 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The Revised$^6$ Report on Scheme requires its generic equivalence predicate, equal?, to terminate even on cyclic inputs. While the terminating equal? can be implemented via a DFA-equivalence or union-find algorithm, these algorithms usually require an additional pointer to be stored in each object, are not suitable for multithreaded code due to their destructive nature, and may be unacceptably slow for the small acyclic values that are the most likely inputs to the predicate.\par This paper presents a variant of the union-find algorithm for equal? that addresses these issues. 
It performs well on large and small, cyclic and acyclic inputs by interleaving a low-overhead algorithm that terminates only for acyclic inputs with a more general algorithm that handles cyclic inputs. The algorithm terminates for all inputs while never being more than a small factor slower than whichever of the acyclic or union-find algorithms would have been faster. Several intermediate algorithms are also presented, each of which might be suitable for use in a particular application, though only the final algorithm is suitable for use in a library procedure, like equal?, that must work acceptably well for all inputs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "dfa equivalence; eq hash tables; equality; scheme; union-find", } @Article{Agrawal:2008:AWS, author = "Kunal Agrawal and Charles E. Leiserson and Yuxiong He and Wen Jing Hsu", title = "Adaptive work-stealing with parallelism feedback", journal = j-TOCS, volume = "26", number = "3", pages = "7:1--7:32", month = sep, year = "2008", CODEN = "ACSYEC", DOI = "https://doi.org/10.1145/1394441.1394443", ISSN = "0734-2071 (print), 1557-7333 (electronic)", ISSN-L = "0734-2071", bibdate = "Wed Sep 17 14:28:13 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/tocs/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/tocs.bib", abstract = "Multiprocessor scheduling in a shared multiprogramming environment can be structured as two-level scheduling, where a kernel-level job scheduler allots processors to jobs and a user-level thread scheduler schedules the work of a job on its allotted processors. We present a randomized work-stealing thread scheduler for fork-join multithreaded jobs that provides continual parallelism feedback to the job scheduler in the form of requests for processors. 
Our A-STEAL algorithm is appropriate for large parallel servers where many jobs share a common multiprocessor resource and in which the number of processors available to a particular job may vary during the job's execution. Assuming that the job scheduler never allots a job more processors than requested by the job's thread scheduler, A-STEAL guarantees that the job completes in near-optimal time while utilizing at least a constant fraction of the allotted processors.\par We model the job scheduler as the thread scheduler's adversary, challenging the thread scheduler to be robust to the operating environment as well as to the job scheduler's administrative policies. For example, the job scheduler might make a large number of processors available exactly when the job has little use for them. To analyze the performance of our adaptive thread scheduler under this stringent adversarial assumption, we introduce a new technique called {\em trim analysis,\/} which allows us to prove that our thread scheduler performs poorly on no more than a small number of time steps, exhibiting near-optimal behavior on the vast majority.\par More precisely, suppose that a job has work $ T_1 $ and span $ T_\infty $. On a machine with $P$ processors, A-STEAL completes the job in an expected duration of $ O(T_1 / \tilde {P} + T_\infty + L \lg P)$ time steps, where $L$ is the length of a scheduling quantum, and $ \tilde {P}$ denotes the $ O(T_\infty + L \lg P)$-trimmed availability. This quantity is the average of the processor availability over all time steps except the $ O(T_\infty + L \lg P)$ time steps that have the highest processor availability. When the job's parallelism dominates the trimmed availability, that is, $ \tilde {P} \ll T_1 / T_\infty $, the job achieves nearly perfect linear speedup. 
Conversely, when the trimmed mean dominates the parallelism, the asymptotic running time of the job is nearly the length of its span, which is optimal.\par We measured the performance of A-STEAL on a simulated multiprocessor system using synthetic workloads. For jobs with sufficient parallelism, our experiments confirm that A-STEAL provides almost perfect linear speedup across a variety of processor availability profiles. We compared A-STEAL with the ABP algorithm, an adaptive work-stealing thread scheduler developed by Arora et al. [1998] which does not employ parallelism feedback. On moderately to heavily loaded machines with large numbers of processors, A-STEAL typically completed jobs more than twice as quickly as ABP, despite being allotted the same number or fewer processors on every step, while wasting only 10\% of the processor cycles wasted by ABP.", acknowledgement = ack-nhfb, articleno = "7", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J774", keywords = "adaptive scheduling; adversary; instantaneous parallelism; job scheduling; multiprocessing; multiprogramming; parallel computation; parallelism feedback; processor allocation; randomized algorithm; space sharing; span; thread scheduling; trim analysis; two-level scheduling; work; work-stealing", } @Article{Anderson:2008:SCD, author = "Zachary Anderson and David Gay and Rob Ennals and Eric Brewer", title = "{SharC}: checking data sharing strategies for multithreaded {C}", journal = j-SIGPLAN, volume = "43", number = "6", pages = "149--158", month = jun, year = "2008", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1379022.1375600", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Mar 11 17:33:54 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Unintended or unmediated data sharing is a frequent cause of insidious bugs in multithreaded programs. 
We present a tool called SharC (short for Sharing Checker) that allows a user to write lightweight annotations to declare how they believe objects are being shared between threads in their program. SharC uses a combination of static and dynamic analyses to check that the program conforms to this specification.\par SharC allows any type to have one of five 'sharing modes' -- private to the current thread, read-only, shared under the control of a specified lock, intentionally racy, or checked dynamically. The dynamic mode uses run-time checking to verify that objects are either read-only, or only accessed by one thread. This allows us to check programs that would be difficult to check with a purely static system. If the user does not give a type an explicit annotation, then SharC uses a static type-qualifier analysis to infer that it is either private or should be checked dynamically.\par SharC allows objects to move between different sharing modes at runtime by using reference counting to check that there are no other references to the objects when they change mode.\par SharC's baseline dynamic analysis can check any C program, but is slow, and will generate false warnings about intentional data sharing. As the user adds more annotations, false warnings are reduced, and performance improves. We have found in practice that very few annotations are needed to describe all sharing and give reasonable performance. 
We ran SharC on 6 legacy C programs, summing to over 600k lines of code, and found that a total of only 60 simple annotations were needed to remove all false positives and to reduce performance overhead to only 2-14\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "data-race", } @Article{Athanasaki:2008:EPL, author = "Evangelia Athanasaki and Nikos Anastopoulos and Kornilios Kourtis and Nectarios Koziris", title = "Exploring the performance limits of simultaneous multithreading for memory intensive applications", journal = j-J-SUPERCOMPUTING, volume = "44", number = "1", pages = "64--97", month = apr, year = "2008", CODEN = "JOSUED", DOI = "https://doi.org/10.1007/s11227-007-0149-x", ISSN = "0920-8542 (print), 1573-0484 (electronic)", ISSN-L = "0920-8542", bibdate = "Wed Jul 9 17:32:34 MDT 2008", bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=44&issue=1; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=44&issue=1&spage=64", acknowledgement = ack-nhfb, fjournal = "The Journal of Supercomputing", journal-URL = "http://link.springer.com/journal/11227", keywords = "Instruction-level parallelism; Performance analysis; Simultaneous multithreading; Software prefetching; Speculative precomputation; Thread-level parallelism", } @Article{Auerbach:2008:FTG, author = "Joshua Auerbach and David F. 
Bacon and Rachid Guerraoui and Jesper Honig Spring and Jan Vitek", title = "Flexible task graphs: a unified restricted thread programming model for {Java}", journal = j-SIGPLAN, volume = "43", number = "7", pages = "1--11", month = jul, year = "2008", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1375657.1375659", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 11:05:54 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The disadvantages of unconstrained shared-memory multi-threading in Java, especially with regard to latency and determinism in realtime systems, have given rise to a variety of language extensions that place restrictions on how threads allocate, share, and communicate memory, leading to order-of-magnitude reductions in latency and jitter. However, each model makes different trade-offs with respect to expressiveness, efficiency, enforcement, and latency, and no one model is best for all applications.\par In this paper we present Flexible Task Graphs (Flexotasks), a single system that allows different isolation policies and mechanisms to be combined in an orthogonal manner, subsuming four previously proposed models as well as making it possible to use new combinations best suited to the needs of particular applications. We evaluate our implementation on top of the IBM Web-Sphere Real Time Java virtual machine using both a microbenchmark and a 30 KLOC avionics collision detector. 
We show that Flexotasks are capable of executing periodic threads at 10 KHz with a standard deviation of 1.2$ \mu $ s and that it achieves significantly better performance than RTSJ's scoped memory constructs while remaining impervious to interference from global garbage collection.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "Java Virtual Machine; memory management; ownership types; real-time systems", } @Article{Bahmann:2008:EFK, author = "Helge Bahmann and Konrad Froitzheim", title = "Extending futex for kernel to user notification", journal = j-OPER-SYS-REV, volume = "42", number = "5", pages = "18--26", month = jul, year = "2008", CODEN = "OSRED8", DOI = "https://doi.org/10.1145/1400097.1400100", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Wed Aug 6 16:54:12 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Threads in reactive applications need to service a multitude of events from different sources such as device drivers, communication channels or cooperating threads. While notification about these events can conceptually be understood as a form of 'synchronization', most operating systems (including Linux) do not provide a unified abstraction. This paper proposes to separate event delivery and notification, and to provide unified event notification through general-purpose synchronization objects. It demonstrates how this unified mechanism can be implemented in Linux as an extension of the futex mechanism to allow notification from kernel-space. Required modifications are discussed and their impact is assessed. 
The new event notification mechanism allows to move many thread activation policy decisions into user-space, with benefits for multi-threaded reactive applications: This is demonstrated in a modification of the leader/followers pattern with considerable performance benefits.", acknowledgement = ack-nhfb, fjournal = "ACM SIGOPS Operating Systems Review", keywords = "event notification; followers; futex; leader {\&} synchronization", } @Article{Boehm:2008:FCC, author = "Hans-J. Boehm and Sarita V. Adve", title = "Foundations of the {C++} concurrency memory model", journal = j-SIGPLAN, volume = "43", number = "6", pages = "68--78", month = jun, year = "2008", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1379022.1375591", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 11:04:53 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Currently multi-threaded C or C++ programs combine a single-threaded programming language with a separate threads library. This is not entirely sound [7].\par We describe an effort, currently nearing completion, to address these issues by explicitly providing semantics for threads in the next revision of the C++ standard. Our approach is similar to that recently followed by Java [25], in that, at least for a well-defined and interesting subset of the language, we give sequentially consistent semantics to programs that do not contain data races. Nonetheless, a number of our decisions are often surprising even to those familiar with the Java effort:\par We (mostly) insist on sequential consistency for race-free programs, in spite of implementation issues that came to light after the Java work.\par We give no semantics to programs with data races. 
There are no benign C++ data races.\par We use weaker semantics for trylock than existing languages or libraries, allowing us to promise sequential consistency with an intuitive race definition, even for programs with trylock.\par This paper describes the simple model we would like to be able to provide for C++ threads programmers, and explain how this, together with some practical, but often under-appreciated implementation constraints, drives us towards the above decisions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "c++; data race; memory consistency; memory model; sequential consistency; trylock", } @Article{Boneti:2008:SCP, author = "Carlos Boneti and Francisco J. Cazorla and Roberto Gioiosa and Alper Buyuktosunoglu and Chen-Yong Cher and Mateo Valero", title = "Software-Controlled Priority Characterization of {POWER5} Processor", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "415--426", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1109/ISCA.2008.8", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Due to the limitations of instruction-level parallelism, thread-level parallelism has become a popular way to improve processor performance. One example is the IBM POWER5 processor, a two-context simultaneous-multithreaded dual-core chip. In each SMT core, the IBM POWER5 features two levels of thread resource balancing and prioritization. The first level provides automatic in-hardware resource balancing, while the second level is a software-controlled priority mechanism that presents eight levels of thread priorities. 
Currently, software-controlled prioritization is only used in limited number of cases in the software platforms due to lack of performance characterization of the effects of this mechanism. In this work, we characterize the effects of the software-based prioritization on several different workloads. We show that the impact of the prioritization significantly depends on the workloads coscheduled on a core. By prioritizing the right task, it is possible to obtain more than two times of throughput improvement for synthetic workloads compared to the baseline. We also present two application case studies targeting two different performance metrics: the first case study improves overall throughput by 23.7\% and the second case study reduces the total execution time by 9.3\%. In addition, we show the circumstances when a background thread can be run transparently without affecting the performance of the foreground thread.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", keywords = "IBM POWER5; performance characterization; simultaneous multithreading; SMT; software-controlled prioritization", } @Article{Campanoni:2008:PDC, author = "Simone Campanoni and Giovanni Agosta and Stefano Crespi Reghizzi", title = "A parallel dynamic compiler for {CIL} bytecode", journal = j-SIGPLAN, volume = "43", number = "4", pages = "11--20", month = apr, year = "2008", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1374752.1374754", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 11:04:46 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Multi-core technology is being employed in most recent high-performance architectures. 
Such architectures need specifically designed multi-threaded software to exploit all the potentialities of their hardware parallelism.\par At the same time, object code virtualization technologies are achieving a growing popularity, as they allow higher levels of software portability and reuse.\par Thus, a virtual execution environment running on a multi-core processor has to run complex, high-level applications and to exploit as much as possible the underlying parallel hardware. We propose an approach that leverages on CMP features to expose a novel pipeline synchronization model for the internal threads of the dynamic compiler.\par Thanks to compilation latency masking effect of the pipeline organization, our dynamic compiler, ILDJIT, is able to achieve significant speedups (26\% on average) with respect to the baseline, when the underlying hardware exposes at least two cores.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "dynamic compilation; parallel virtual machine; virtual execution system", } @Article{Choi:2008:ABP, author = "Bumyong Choi and Leo Porter and Dean M. Tullsen", title = "Accurate branch prediction for short threads", journal = j-OPER-SYS-REV, volume = "42", number = "2", pages = "125--134", month = mar, year = "2008", CODEN = "OSRED8", DOI = "https://doi.org/10.1145/1353534.1346298", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Fri Jun 20 17:20:12 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/opersysrev.bib", abstract = "Multi-core processors, with low communication costs and high availability of execution cores, will increase the use of execution and compilation models that use short threads to expose parallelism. Current branch predictors seek to incorporate large amounts of control flow history to maximize accuracy. 
However, when that history is absent the predictor fails to work as intended. Thus, modern predictors are almost useless for threads below a certain length.\par Using a Speculative Multithreaded (SpMT) architecture as an example of a system which generates shorter threads, this work examines techniques to improve branch prediction accuracy when a new thread begins to execute on a different core. This paper proposes a minor change to the branch predictor that gives virtually the same performance on short threads as an idealized predictor that incorporates unknowable pre-history of a spawned speculative thread. At the same time, strong performance on long threads is preserved. The proposed technique sets the global history register of the spawned thread to the initial value of the program counter. This novel and simple design reduces branch mispredicts by 29\% and provides as much as a 13\% IPC improvement on selected SPEC2000 benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGOPS Operating Systems Review", keywords = "branch prediction; chip multiprocessors", } @Article{Chugh:2008:DAC, author = "Ravi Chugh and Jan W. Voung and Ranjit Jhala and Sorin Lerner", title = "Dataflow analysis for concurrent programs using datarace detection", journal = j-SIGPLAN, volume = "43", number = "6", pages = "316--326", month = jun, year = "2008", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1375581.1375620", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 11:04:53 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Dataflow analyses for concurrent programs differ from their single-threaded counterparts in that they must account for shared memory locations being overwritten by concurrent threads. 
Existing dataflow analysis techniques for concurrent programs typically fall at either end of a spectrum: at one end, the analysis conservatively kills facts about all data that might possibly be shared by multiple threads; at the other end, a precise thread-interleaving analysis determines which data may be shared, and thus which dataflow facts must be invalidated. The former approach can suffer from imprecision, whereas the latter does not scale.\par We present RADAR, a framework that automatically converts a dataflow analysis for sequential programs into one that is correct for concurrent programs. RADAR uses a race detection engine to kill the dataflow facts, generated and propagated by the sequential analysis, that become invalid due to concurrent writes. Our approach of factoring all reasoning about concurrency into a race detection engine yields two benefits. First, to obtain analyses for code using new concurrency constructs, one need only design a suitable race detection engine for the constructs. Second, it gives analysis designers an easy way to tune the scalability and precision of the overall analysis by only modifying the race detection engine. We describe the RADAR framework and its implementation using a pre-existing race detection engine. We show how RADAR was used to generate a concurrent version of a null-pointer dereference analysis, and we analyze the result of running the generated concurrent analysis on several benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "interprocedural analysis; locksets; multithreaded programs; summaries", } @Article{Curtis-Maury:2008:PBP, author = "Matthew Curtis-Maury and Filip Blagojevic and Christos D. Antonopoulos and Dimitrios S. 
Nikolopoulos", title = "Prediction-Based Power-Performance Adaptation of Multithreaded Scientific Codes", journal = j-IEEE-TRANS-PAR-DIST-SYS, volume = "19", number = "10", pages = "1396--1410", month = oct, year = "2008", CODEN = "ITDSEO", DOI = "https://doi.org/10.1109/TPDS.2007.70804", ISSN = "1045-9219 (print), 1558-2183 (electronic)", ISSN-L = "1045-9219", bibdate = "Thu May 13 12:06:56 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Parallel and Distributed Systems", journal-URL = "http://www.computer.org/tpds/archives.htm", } @Article{Fekete:2008:TSD, author = "Alan D. Fekete", title = "Teaching students to develop thread-safe {Java} classes", journal = j-SIGCSE, volume = "40", number = "3", pages = "119--123", month = sep, year = "2008", CODEN = "SIGSD3", DOI = "https://doi.org/10.1145/1597849.1384304", ISSN = "0097-8418 (print), 2331-3927 (electronic)", ISSN-L = "0097-8418", bibdate = "Sat Nov 17 15:44:14 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/csharp.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigcse2000.bib", note = "Proceedings of ITiCSE '08.", abstract = "Concurrent programming was once the preserve of experts writing systems internals; but recently the growing importance of application servers, and the excellent support in Java and C\# for thread handling, has brought threads and locking as topics that every software developer might experience, and therefore every computer science graduate ought to know. In this paper we report on several years of experience teaching this material in the early years of the curriculum. We focus on one aspect of multi-threaded code, namely how to write sensible thread-safe classes. 
We identify the learning outcomes we aim to deliver, and we discuss the main pedagogic difficulties students find. We present some examples that can help students avoid common erroneous views.", acknowledgement = ack-nhfb, fjournal = "SIGCSE Bulletin (ACM Special Interest Group on Computer Science Education)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J688", } @Article{Fide:2008:PUS, author = "S. Fide and S. Jenks", title = "Proactive Use of Shared {L3} Caches to Enhance Cache Communications in Multi-Core Processors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "7", number = "2", pages = "57--60", month = jul, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2008.10", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The software and hardware techniques to exploit the potential of multi-core processors are falling behind, even though the number of cores and cache levels per chip is increasing rapidly. There is no explicit communications support available, and hence inter-core communications depend on cache coherence protocols, resulting in demand-based cache line transfers with their inherent latency and overhead. In this paper, we present software controlled eviction (SCE) to improve the performance of multithreaded applications running on multi-core processors by moving shared data to shared cache levels before it is demanded from remote private caches. 
Simulation results show that SCE offers significant performance improvement (8-28\%) and reduces L3 cache misses by 88-98\%.", acknowledgement = ack-nhfb, fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "cache coherence protocol; cache communication; cache storage; Concurrent computing; Control systems; Degradation; Delay; demand-based cache line transfer; Hardware; intercore communications; microprocessor chips; Multi-core/single-chip multiprocessors; multi-threading; Multicore processing; multicore processors; multithreaded application; Parallel processing; Protocols; shared L3 cache; shared memory systems; software controlled eviction; Software performance; Support for multi-threaded execution", } @Article{Flanagan:2008:ADA, author = "Cormac Flanagan and Stephen N. Freund", title = "{Atomizer}: a dynamic atomicity checker for multithreaded programs", journal = j-SCI-COMPUT-PROGRAM, volume = "71", number = "2", pages = "89--109", day = "1", month = apr, year = "2008", CODEN = "SCPGD4", ISSN = "0167-6423 (print), 1872-7964 (electronic)", ISSN-L = "0167-6423", bibdate = "Fri Apr 1 18:39:19 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sciencedirect.com/science/journal/01676423", acknowledgement = ack-nhfb, fjournal = "Science of Computer Programming", journal-URL = "http://www.sciencedirect.com/science/journal/01676423", } @Article{Flanagan:2008:TAS, author = "Cormac Flanagan and Stephen N. 
Freund and Marina Lifshin and Shaz Qadeer", title = "Types for atomicity: {Static} checking and inference for {Java}", journal = j-TOPLAS, volume = "30", number = "4", pages = "20:1--20:52", month = jul, year = "2008", CODEN = "ATPSDT", DOI = "https://doi.org/10.1145/1377492.1377495", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Tue Aug 5 19:14:53 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/toplas/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Atomicity is a fundamental correctness property in multithreaded programs. A method is atomic if, for every execution, there is an equivalent serial execution in which the actions of the method are not interleaved with actions of other threads. Atomic methods are amenable to sequential reasoning, which significantly facilitates subsequent analysis and verification.\par This article presents a type system for specifying and verifying the atomicity of methods in multithreaded Java programs using a synthesis of Lipton's theory of reduction and type systems for race detection. The type system supports guarded, write-guarded, and unguarded fields, as well as thread-local data, parameterized classes and methods, and protected locks. We also present an algorithm for verifying atomicity via type inference.\par We have applied our type checker and type inference tools to a number of commonly used Java library classes and programs. These tools were able to verify the vast majority of methods in these benchmarks as atomic, indicating that atomicity is a widespread methodology for multithreaded programming. 
In addition, reported atomicity violations revealed some subtle errors in the synchronization disciplines of these programs.", acknowledgement = ack-nhfb, articleno = "20", fjournal = "ACM Transactions on Programming Languages and Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783", keywords = "Atomicity; concurrent programs; type inference; type systems", } @Article{Flanagan:2008:VSC, author = "Cormac Flanagan and Stephen N. Freund and Jaeheon Yi", title = "{Velodrome}: a sound and complete dynamic atomicity checker for multithreaded programs", journal = j-SIGPLAN, volume = "43", number = "6", pages = "293--303", month = jun, year = "2008", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1375581.1375618", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 11:04:53 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Atomicity is a fundamental correctness property in multithreaded programs, both because atomic code blocks are amenable to sequential reasoning (which significantly simplifies correctness arguments), and because atomicity violations often reveal defects in a program's synchronization structure. Unfortunately, all atomicity analyses developed to date are incomplete in that they may yield false alarms on correctly synchronized programs, which limits their usefulness.\par We present the first dynamic analysis for atomicity that is both sound and complete. The analysis reasons about the exact dependencies between operations in the observed trace of the target program, and it reports error messages if and only if the observed trace is not conflict-serializable. 
Despite this significant increase in precision, the performance and coverage of our analysis is competitive with earlier incomplete dynamic analyses for atomicity.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "atomicity; dynamic analysis; serializability", } @Article{Gidenstam:2008:LLF, author = "Anders Gidenstam and Marina Papatriantafilou", title = "{LFTHREADS}: a lock-free thread library", journal = j-COMP-ARCH-NEWS, volume = "36", number = "5", pages = "88--92", month = dec, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1556444.1556456", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Jun 26 11:50:56 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "This extended abstract presents LFTHREADS, a thread library entirely based on lock-free methods, i.e. no spinlocks or similar synchronization mechanisms are employed in the implementation of the multithreading. Since lockfreedom is highly desirable in multiprocessors/multicores due to its advantages in parallelism, fault-tolerance, convoy-avoidance and more, there is an increased demand in lock-free methods in parallel applications, hence also in multiprocessor/multicore system services. LFTHREADS is the first thread library that provides a lock-free implementation of blocking synchronization primitives for application threads; although the latter may sound like a contradicting goal, such objects have several benefits: e.g. library operations that block and unblock threads on the same synchronization object can make progress in parallel while maintaining the desired thread-level semantics and without having to wait for any 'low' operations among them. Besides, as no spin-locks or similar synchronization mechanisms are employed, memory contention can be reduced and processors/cores are able to do useful work. 
As a consequence, applications, too, can enjoy enhanced parallelism and fault-tolerance. For the synchronization in LFTHREADS we have introduced a new method, which we call responsibility hand-off (RHO), that does not need any special kernel support. The RHO method is also of independent interest, as it can also serve as a tool for lock-free token passing, management of contention and interaction between scheduling and synchronization. This paper gives an outline and the context of LFTHREADS. For more details the reader is referred to [7] and [8].", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Gravvanis:2008:JMB, author = "George A. Gravvanis and Victor N. Epitropou", title = "{Java} multithreading-based parallel approximate arrow-type inverses", journal = j-CCPE, volume = "20", number = "10", pages = "1151--1172", month = jul, year = "2008", CODEN = "CCPEBO", DOI = "https://doi.org/10.1002/cpe.1262", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Mon Dec 5 10:08:25 MST 2011", bibsource = "http://www.interscience.wiley.com/jpages/1532-0626; https://www.math.utah.edu/pub/tex/bib/ccpe.bib; https://www.math.utah.edu/pub/tex/bib/java2000.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Concurrency and Computation: Prac\-tice and Experience", journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626", onlinedate = "18 Sep 2007", } @Article{Hassanein:2008:AEH, author = "Wessam M. Hassanein and Layali K. Rashid and Moustafa A. 
Hammad", title = "Analyzing the Effects of Hyperthreading on the Performance of Data Management Systems", journal = j-INT-J-PARALLEL-PROG, volume = "36", number = "2", pages = "206--225", month = apr, year = "2008", CODEN = "IJPPE5", DOI = "https://doi.org/10.1007/s10766-007-0066-x", ISSN = "0885-7458 (print), 1573-7640 (electronic)", ISSN-L = "0885-7458", bibdate = "Wed Jul 9 16:07:03 MDT 2008", bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=36&issue=2; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=36&issue=2&spage=206", acknowledgement = ack-nhfb, fjournal = "International Journal of Parallel Programming", journal-URL = "http://link.springer.com/journal/10766", keywords = "Data management systems; Databases; Hyper-threaded architectures; Performance; Simultaneous multithreading", } @Article{He:2008:COD, author = "Bingsheng He and Qiong Luo", title = "Cache-oblivious databases: {Limitations} and opportunities", journal = j-TODS, volume = "33", number = "2", pages = "8:1--8:??", month = jun, year = "2008", CODEN = "ATDSD3", DOI = "https://doi.org/10.1145/1366102.1366105", ISSN = "0362-5915 (print), 1557-4644 (electronic)", ISSN-L = "0362-5915", bibdate = "Wed Jun 25 08:39:17 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/tods/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Cache-oblivious techniques, proposed in the theory community, have optimal asymptotic bounds on the amount of data transferred between any two adjacent levels of an arbitrary memory hierarchy. Moreover, this optimal performance is achieved without any hardware platform specific tuning. 
These properties are highly attractive to autonomous databases, especially because the hardware architectures are becoming increasingly complex and diverse.\par In this article, we present our design, implementation, and evaluation of the first cache-oblivious in-memory query processor, EaseDB. Moreover, we discuss the inherent limitations of the cache-oblivious approach as well as the opportunities given by the upcoming hardware architectures. Specifically, a cache-oblivious technique usually requires sophisticated algorithm design to achieve a comparable performance to its cache-conscious counterpart. Nevertheless, this development-time effort is compensated by the automaticity of performance achievement and the reduced ownership cost. Furthermore, this automaticity enables cache-oblivious techniques to outperform their cache-conscious counterparts in multi-threading processors.", acknowledgement = ack-nhfb, articleno = "8", fjournal = "ACM Transactions on Database Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J777", keywords = "cache-conscious; cache-oblivious; chip multiprocessors; data caches; simultaneous multithreading", } @Article{Jacobs:2008:PMC, author = "Bart Jacobs and Frank Piessens and Jan Smans and K. Rustan M. Leino and Wolfram Schulte", title = "A programming model for concurrent object-oriented programs", journal = j-TOPLAS, volume = "31", number = "1", pages = "1:1--1:48", month = dec, year = "2008", CODEN = "ATPSDT", DOI = "https://doi.org/10.1145/1452044.1452045", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Tue Dec 23 11:52:52 MST 2008", bibsource = "http://www.acm.org/pubs/contents/journals/toplas/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Reasoning about multithreaded object-oriented programs is difficult, due to the nonlocal nature of object aliasing and data races. 
We propose a programming regime (or {\em programming model\/}) that rules out data races, and enables local reasoning in the presence of object aliasing and concurrency. Our programming model builds on the multithreading and synchronization primitives as they are present in current mainstream programming languages. Java or C\\# programs developed according to our model can be annotated by means of stylized comments to make the use of the model explicit. We show that such annotated programs can be formally verified to comply with the programming model. If the annotated program verifies, the underlying Java or C\\# program is guaranteed to be free from data races, and it is sound to reason locally about program behavior. Verification is modular: a program is valid if all methods are valid, and validity of a method does not depend on program elements that are not visible to the method. We have implemented a verifier for programs developed according to our model in a custom build of the Spec\\# programming system, and we have validated our approach on a case study.", acknowledgement = ack-nhfb, articleno = "1", fjournal = "ACM Transactions on Programming Languages and Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783", keywords = "Aliasing; data races; local reasoning; modular reasoning; ownership; verification condition generation", } @Article{Jaisson:2008:IPM, author = "Pascal Jaisson and Florian {De Vuyst}", title = "An innovating {PDE} model based on fluid flow paradigm for multithread systems", journal = j-COMP-NET-AMSTERDAM, volume = "52", number = "18", pages = "3318--3324", day = "22", month = dec, year = "2008", CODEN = "????", ISSN = "1389-1286 (print), 1872-7069 (electronic)", ISSN-L = "1389-1286", bibdate = "Sat Apr 2 08:42:29 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sciencedirect.com/science/journal/13891286", acknowledgement = ack-nhfb, fjournal = "Computer Networks (Amsterdam, 
Netherlands: 1999)", journal-URL = "http://www.sciencedirect.com/science/journal/13891286", } @Article{Kang:2008:ISE, author = "Dongsoo Kang and Chen Liu and Jean-Luc Gaudiot", title = "The Impact of Speculative Execution on {SMT} Processors", journal = j-INT-J-PARALLEL-PROG, volume = "36", number = "4", pages = "361--385", month = aug, year = "2008", CODEN = "IJPPE5", DOI = "https://doi.org/10.1007/s10766-007-0052-3", ISSN = "0885-7458 (print), 1573-7640 (electronic)", ISSN-L = "0885-7458", bibdate = "Wed Jul 9 16:07:14 MDT 2008", bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=36&issue=4; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=36&issue=4&spage=361", acknowledgement = ack-nhfb, fjournal = "International Journal of Parallel Programming", journal-URL = "http://link.springer.com/journal/10766", keywords = "Confidence estimator; Simultaneous multithreading; Speculation control; Thread scheduling", } @Article{Kgil:2008:PUS, author = "Taeho Kgil and Ali Saidi and Nathan Binkert and Steve Reinhardt and Krisztian Flautner and Trevor Mudge", title = "{PicoServer}: {Using} {$3$D} stacking technology to build energy efficient servers", journal = j-JETC, volume = "4", number = "4", pages = "16:1--16:??", month = oct, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1412587.1412589", ISSN = "1550-4832", ISSN-L = "1550-4832", bibdate = "Wed Mar 17 14:22:55 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/jetc/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "This article extends our prior work to show that a straightforward use of 3D stacking technology enables the design of compact energy-efficient servers. 
Our proposed architecture, called PicoServer, employs 3D technology to bond one die containing several simple, slow processing cores to multiple memory dies sufficient for a primary memory. The multiple memory dies are composed of DRAM. This use of 3D stacks readily facilitates wide low-latency buses between processors and memory. These remove the need for an L2 cache allowing its area to be re-allocated to additional simple cores. The additional cores allow the clock frequency to be lowered without impairing throughput. Lower clock frequency means that thermal constraints, a concern with 3D stacking, are easily satisfied. We extend our original analysis on PicoServer to include: (1) a wider set of server workloads, (2) the impact of multithreading, and (3) the on-chip DRAM architecture and system memory usage. PicoServer is intentionally simple, requiring only the simplest form of 3D technology where die are stacked on top of one another. Our intent is to minimize risk of introducing a new technology (3D) to implement a class of low-cost, low-power compact server architectures.", acknowledgement = ack-nhfb, articleno = "16", fjournal = "ACM Journal on Emerging Technologies in Computing Systems (JETC)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967", keywords = "3D stacking technology; chip multiprocessor; full-system simulation; Low power; Tier-1/2/3 server", } @Article{Krashinsky:2008:ISV, author = "Ronny Krashinsky and Christopher Batten and Krste Asanovi{\'c}", title = "Implementing the {Scale} vector-thread processor", journal = j-TODAES, volume = "13", number = "3", pages = "41:1--41:??", month = jul, year = "2008", CODEN = "ATASFO", DOI = "https://doi.org/10.1145/1367045.1367050", ISSN = "1084-4309 (print), 1557-7309 (electronic)", ISSN-L = "1084-4309", bibdate = "Tue Aug 5 18:41:27 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/todaes/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The Scale 
vector-thread processor is a complexity-effective solution for embedded computing which flexibly supports both vector and highly multithreaded processing. The 7.1-million transistor chip has 16 decoupled execution clusters, vector load and store units, and a nonblocking 32KB cache. An automated and iterative design and verification flow enabled a performance-, power-, and area-efficient implementation with two person-years of development effort. Scale has a core area of 16.6 mm$^2$ in 180 nm technology, and it consumes 400 mW--1.1 W while running at 260 MHz.", acknowledgement = ack-nhfb, articleno = "41", fjournal = "ACM Transactions on Design Automation of Electronic Systems (TODAES)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J776", keywords = "hybrid C++/Verilog simulation; iterative VLSI design flow; multithreaded processors; procedural datapath pre-placement; vector processors; vector-thread processors", } @Article{Kumar:2008:AVO, author = "Sanjeev Kumar and Daehyun Kim and Mikhail Smelyanskiy and Yen-Kuang Chen and Jatin Chhugani and Christopher J. Hughes and Changkyu Kim and Victor W. Lee and Anthony D. Nguyen", title = "Atomic Vector Operations on Chip Multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "441--452", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1394608.1382154", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The current trend is for processors to deliver dramatic improvements in parallel performance while only modestly improving serial performance. Parallel performance is harvested through vector/SIMD instructions as well as multithreading (through both multithreaded cores and chip multiprocessors). Vector parallelism can be more efficiently supported than multithreading, but is often harder for software to exploit. 
In particular, code with sparse data access patterns cannot easily utilize the vector/SIMD instructions of mainstream processors. Hardware to scatter and gather sparse data has previously been proposed to enable vector execution for these codes. However, on multithreaded architectures, a number of applications spend significant time on atomic operations (e.g., parallel reductions), which cannot be vectorized using previously proposed schemes. This paper proposes architectural support for atomic vector operations (referred to as GLSC) that addresses this limitation. GLSC extends scatter-gather hardware to support atomic memory operations. Our experiments show that the GLSC provides an average performance improvement on a set of important RMS kernels of 54\% for 4-wide SIMD.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", keywords = "locks; multiprocessors; reductions; SIMD; vector", } @Article{Li:2008:TAN, author = "Z. Li and C. Zhu and L. Shang and R. Dick and Y. Sun", title = "Transaction-Aware Network-on-Chip Resource Reservation", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "7", number = "2", pages = "53--56", month = jul, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2008.9", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Fri Jun 21 05:49:19 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Performance and scalability are critically-important for on-chip interconnect in many-core chip-multiprocessor systems. Packet-switched interconnect fabric, widely viewed as the de facto on-chip data communication backplane in the many-core era, offers high throughput and excellent scalability. 
However, these benefits come at the price of router latency due to run-time multi-hop data buffering and resource arbitration. The network accounts for a majority of on-chip data transaction latency. In this work, we propose dynamic in-network resource reservation techniques to optimize run-time on-chip data transactions. This idea is motivated by the need to preserve existing abstraction and general-purpose network performance while optimizing for frequently-occurring network events such as data transactions. Experimental studies using multithreaded benchmarks demonstrate that the proposed techniques can reduce on-chip data access latency by 28.4\% on average in a 16-node system and 29.2\% on average in a 36-node system.", acknowledgement = ack-nhfb, fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Backplanes; buffer storage; Computer buffers; data communication; Data communication; de facto on-chip data communication backplane; Delay; dynamic in-network resource reservation techniques; Fabrics; frequently-occurring network events; Interconnection architectures; Interconnections (Subsystems); many-core chip-multiprocessor systems; multiprocessor interconnection networks; Network-on-a-chip; on-chip data transaction latency; On-chip interconnection networks; packet switching; packet-switched interconnect fabric; Parallel Architectures; resource allocation; router latency; run-time multihop data buffering; Runtime; Scalability; System-on-a-chip; telecommunication network routing; Throughput; transaction-aware network-on-chip resource reservation", } @Article{Liu:2008:HPP, author = "Duo Liu and Zheng Chen and Bei Hua and Nenghai Yu and Xinan Tang", title = "High-performance packet classification algorithm for multithreaded {IXP} network processor", journal = j-TECS, volume = "7", number = "2", pages = "16:1--16:??", month = feb, year = "2008", CODEN = "????", DOI = 
"https://doi.org/10.1145/1331331.1331340", ISSN = "1539-9087 (print), 1558-3465 (electronic)", ISSN-L = "1539-9087", bibdate = "Thu Jun 12 15:22:00 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Packet classification is crucial for the Internet to provide more value-added services and guaranteed quality of service. Besides hardware-based solutions, many software-based classification algorithms have been proposed. However, classifying at 10 Gbps speed or higher is a challenging problem and it is still one of the performance bottlenecks in core routers. In general, classification algorithms face the same challenge of balancing between high classification speed and low memory requirements. This paper proposes a modified recursive flow classification (RFC) algorithm, Bitmap-RFC, which significantly reduces the memory requirements of RFC by applying a bitmap compression technique. To speed up classifying speed, we exploit the multithreaded architectural features in various algorithm development stages from algorithm design to algorithm implementation. As a result, Bitmap-RFC strikes a good balance between speed and space. It can significantly keep both high classification speed and reduce memory space consumption. This paper investigates the main NPU software design aspects that have dramatic performance impacts on any NPU-based implementations: memory space reduction, instruction selection, data allocation, task partitioning, and latency hiding. We experiment with an architecture-aware design principle to guarantee the high performance of the classification algorithm on an NPU implementation. 
The experimental results show that the Bitmap-RFC algorithm achieves 10 Gbps speed or higher and has a good scalability on Intel IXP2800 NPU.", acknowledgement = ack-nhfb, articleno = "16", fjournal = "ACM Transactions on Embedded Computing Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J840", keywords = "architecture; embedded system design; multithreading; network processor; packet classification; thread-level parallelism", } @Article{Madriles:2008:MSM, author = "Carlos Madriles and Carlos Garc{\'\i}a-Qui{\~n}ones and Jes{\'u}s S{\'a}nchez and Pedro Marcuello and Antonio Gonz{\'a}lez and Dean M. Tullsen and Hong Wang and John P. Shen", title = "{Mitosis}: a Speculative Multithreaded Processor Based on Precomputation Slices", journal = j-IEEE-TRANS-PAR-DIST-SYS, volume = "19", number = "7", pages = "914--925", month = jul, year = "2008", CODEN = "ITDSEO", DOI = "https://doi.org/10.1109/TPDS.2007.70797", ISSN = "1045-9219 (print), 1558-2183 (electronic)", ISSN-L = "1045-9219", bibdate = "Thu Jul 3 12:41:00 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Parallel and Distributed Systems", journal-URL = "http://www.computer.org/tpds/archives.htm", } @Article{Montesinos:2008:DRD, author = "Pablo Montesinos and Luis Ceze and Josep Torrellas", title = "{DeLorean}: Recording and Deterministically Replaying Shared-Memory Multiprocessor Execution Efficiently", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "289--300", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1109/ISCA.2008.36", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Support for deterministic replay of multithreaded execution can greatly help 
in finding concurrency bugs. For highest effectiveness, replay schemes should (i) record at production-run speed, (ii) keep their logging requirements minute, and (iii) replay at a speed similar to that of the initial execution. In this paper, we propose a new substrate for deterministic replay that provides substantial advances along these axes. In our proposal, processors execute blocks of instructions atomically, as in transactional memory or speculative multithreading, and the system only needs to record the commit order of these blocks. We call our scheme DeLorean. Our results show that DeLorean records execution at a speed similar to that of Release Consistency (RC) execution and replays at about 82\% of its speed. In contrast, most current schemes only record at the speed of Sequential Consistency (SC) execution. Moreover, DeLorean only needs 7.5\% of the log size needed by a state-of-the-art scheme. Finally, DeLorean can be configured to need only 0.6\% of the log size of the state-of-the-art scheme at the cost of recording at 86\% of RC's execution speed --- still faster than SC. In this configuration, the log of an 8-processor 5-GHz machine is estimated to be only about 20GB per day.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Musuvathi:2008:FSM, author = "Madanlal Musuvathi and Shaz Qadeer", title = "Fair stateless model checking", journal = j-SIGPLAN, volume = "43", number = "6", pages = "362--371", month = jun, year = "2008", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1379022.1375625", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 11:04:53 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Stateless model checking is a useful state-space exploration technique for systematically testing complex real-world software. 
Existing stateless model checkers are limited to the verification of safety properties on terminating programs. However, realistic concurrent programs are nonterminating, a property that significantly reduces the efficacy of stateless model checking in testing them. Moreover, existing stateless model checkers are unable to verify that a nonterminating program satisfies the important liveness property of livelock-freedom, a property that requires the program to make continuous progress for any input.\par To address these shortcomings, this paper argues for incorporating a fair scheduler in stateless exploration. The key contribution of this paper is an explicit scheduler that is (strongly) fair and at the same time sufficiently nondeterministic to guarantee full coverage of safety properties. We have implemented the fair scheduler in the CHESS model checker. We show through theoretical arguments and empirical evaluation that our algorithm satisfies two important properties: (1) it visits all states of a finite-state program achieving state coverage at a faster rate than existing techniques, and (2) it finds all livelocks in a finite-state program. Before this work, nonterminating programs had to be manually modified in order to apply CHESS to them. The addition of fairness has allowed CHESS to be effectively applied to real-world nonterminating programs without any modification. For example, we have successfully booted the Singularity operating system under the control of CHESS.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "concurrency; fairness; liveness; model checking; multi-threading; shared-memory programs; software testing", } @Article{Neamtiu:2008:CEV, author = "Iulian Neamtiu and Michael Hicks and Jeffrey S. 
Foster and Polyvios Pratikakis", title = "Contextual effects for version-consistent dynamic software updating all and safe concurrent programming", journal = j-SIGPLAN, volume = "43", number = "1", pages = "37--49", month = jan, year = "2008", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1328897.1328447", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 11:02:13 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "This paper presents a generalization of standard effect systems that we call contextual effects. A traditional effect system computes the effect of an expression e. Our system additionally computes the effects of the computational context in which $e$ occurs. More specifically, we compute the effect of the computation that has already occurred(the prior effect) and the effect of the computation yet to take place (the future effect).\par Contextual effects are useful when the past or future computation of the program is relevant at various program points. We present two substantial examples. First, we show how prior and future effects can be used to enforce transactional version consistency (TVC), a novel correctness property for dynamic software updates. TV Censures that programmer-designated transactional code blocks appear to execute entirely at the same code version, even if a dynamic update occurs in the middle of the block. Second, we show how future effects can be used in the analysis of multi-threaded programs to find thread-shared locations. 
This is an essential step in applications such as data race detection.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "computation effects; contextual effects; data race detection; dynamic software updating; type and effect systems; version consistency", } @Article{Ottoni:2008:COGa, author = "Guilherme Ottoni and David I. August", title = "Communication optimizations for global multi-threaded instruction scheduling", journal = j-COMP-ARCH-NEWS, volume = "36", number = "1", pages = "222--232", month = mar, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1353535.1346310", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The recent shift in the industry towards chip multiprocessor (CMP) designs has brought the need for multi-threaded applications to mainstream computing. As observed in several limit studies, most of the parallelization opportunities require looking for parallelism beyond local regions of code. To exploit these opportunities, especially for sequential applications, researchers have recently proposed global multi-threaded instruction scheduling techniques, including DSWP and GREMIO. These techniques simultaneously schedule instructions from large regions of code, such as arbitrary loop nests or whole procedures, and have been shown to be effective at extracting threads for many applications. A key enabler of these global instruction scheduling techniques is the Multi-Threaded Code Generation (MTCG) algorithm proposed in [16], which generates multi-threaded code for any partition of the instructions into threads. 
This algorithm inserts communication and synchronization instructions in order to satisfy all inter-thread dependences.\par In this paper, we present a general compiler framework, COCO, to optimize the communication and synchronization instructions inserted by the MTCG algorithm. This framework, based on thread-aware data-flow analyses and graph min-cut algorithms, appropriately models and optimizes all kinds of inter-thread dependences, including register, memory, and control dependences. Our experiments, using a fully automatic compiler implementation of these techniques, demonstrate significant reductions (about 30\% on average) in the number of dynamic communication instructions in code parallelized with DSWP and GREMIO. This reduction in communication translates to performance gains of up to 40\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", keywords = "communication; data-flow analysis; graph min-cut; instruction scheduling; multi-threading; synchronization", } @Article{Ottoni:2008:COGb, author = "Guilherme Ottoni and David I. August", title = "Communication optimizations for global multi-threaded instruction scheduling", journal = j-OPER-SYS-REV, volume = "42", number = "2", pages = "222--232", month = mar, year = "2008", CODEN = "OSRED8", DOI = "https://doi.org/10.1145/1353535.1346310", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Fri Jun 20 17:20:12 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The recent shift in the industry towards chip multiprocessor (CMP) designs has brought the need for multi-threaded applications to mainstream computing. As observed in several limit studies, most of the parallelization opportunities require looking for parallelism beyond local regions of code. 
To exploit these opportunities, especially for sequential applications, researchers have recently proposed global multi-threaded instruction scheduling techniques, including DSWP and GREMIO. These techniques simultaneously schedule instructions from large regions of code, such as arbitrary loop nests or whole procedures, and have been shown to be effective at extracting threads for many applications. A key enabler of these global instruction scheduling techniques is the Multi-Threaded Code Generation (MTCG) algorithm proposed in [16], which generates multi-threaded code for any partition of the instructions into threads. This algorithm inserts communication and synchronization instructions in order to satisfy all inter-thread dependences.\par In this paper, we present a general compiler framework, COCO, to optimize the communication and synchronization instructions inserted by the MTCG algorithm. This framework, based on thread-aware data-flow analyses and graph min-cut algorithms, appropriately models and optimizes all kinds of inter-thread dependences, including register, memory, and control dependences. Our experiments, using a fully automatic compiler implementation of these techniques, demonstrate significant reductions (about 30\% on average) in the number of dynamic communication instructions in code parallelized with DSWP and GREMIO. This reduction in communication translates to performance gains of up to 40\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGOPS Operating Systems Review", keywords = "communication; data-flow analysis; graph min-cut; instruction scheduling; multi-threading; synchronization", } @Article{Ottoni:2008:COGc, author = "Guilherme Ottoni and David I. 
August", title = "Communication optimizations for global multi-threaded instruction scheduling", journal = j-SIGPLAN, volume = "43", number = "3", pages = "222--232", month = mar, year = "2008", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1353535.1346310", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 11:03:40 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The recent shift in the industry towards chip multiprocessor (CMP) designs has brought the need for multi-threaded applications to mainstream computing. As observed in several limit studies, most of the parallelization opportunities require looking for parallelism beyond local regions of code. To exploit these opportunities, especially for sequential applications, researchers have recently proposed global multi-threaded instruction scheduling techniques, including DSWP and GREMIO. These techniques simultaneously schedule instructions from large regions of code, such as arbitrary loop nests or whole procedures, and have been shown to be effective at extracting threads for many applications. A key enabler of these global instruction scheduling techniques is the Multi-Threaded Code Generation (MTCG) algorithm proposed in [16], which generates multi-threaded code for any partition of the instructions into threads. This algorithm inserts communication and synchronization instructions in order to satisfy all inter-thread dependences.\par In this paper, we present a general compiler framework, COCO, to optimize the communication and synchronization instructions inserted by the MTCG algorithm. This framework, based on thread-aware data-flow analyses and graph min-cut algorithms, appropriately models and optimizes all kinds of inter-thread dependences, including register, memory, and control dependences. 
Our experiments, using a fully automatic compiler implementation of these techniques, demonstrate significant reductions (about 30\% on average) in the number of dynamic communication instructions in code parallelized with DSWP and GREMIO. This reduction in communication translates to performance gains of up to 40\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "communication; data-flow analysis; graph min-cut; instruction scheduling; multi-threading; synchronization", } @Article{Rangan:2008:PSD, author = "Ram Rangan and Neil Vachharajani and Guilherme Ottoni and David I. August", title = "Performance scalability of decoupled software pipelining", journal = j-TACO, volume = "5", number = "2", pages = "8:1--8:??", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1400112.1400113", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Aug 28 13:25:00 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Any successful solution to using multicore processors to scale general-purpose program performance will have to contend with rising intercore communication costs while exposing coarse-grained parallelism. Recently proposed pipelined multithreading (PMT) techniques have been demonstrated to have general-purpose applicability and are also able to effectively tolerate inter-core latencies through pipelined interthread communication. These desirable properties make PMT techniques strong candidates for program parallelization on current and future multicore processors and understanding their performance characteristics is critical to their deployment. 
To that end, this paper evaluates the performance scalability of a general-purpose PMT technique called decoupled software pipelining (DSWP) and presents a thorough analysis of the communication bottlenecks that must be overcome for optimal DSWP scalability.", acknowledgement = ack-nhfb, articleno = "8", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", keywords = "decoupled software pipelining; performance analysis", } @Article{Rounce:2008:DIS, author = "Peter A. Rounce and Alberto F. De Souza", title = "Dynamic Instruction Scheduling in a Trace-based Multi-threaded Architecture", journal = j-INT-J-PARALLEL-PROG, volume = "36", number = "2", pages = "184--205", month = apr, year = "2008", CODEN = "IJPPE5", DOI = "https://doi.org/10.1007/s10766-007-0062-1", ISSN = "0885-7458 (print), 1573-7640 (electronic)", ISSN-L = "0885-7458", bibdate = "Wed Jul 9 16:07:03 MDT 2008", bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=36&issue=2; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=36&issue=2&spage=184", acknowledgement = ack-nhfb, fjournal = "International Journal of Parallel Programming", journal-URL = "http://link.springer.com/journal/10766", keywords = "Dynamic instruction scheduling; Simultaneous multi-threading; VLIW; Wide issue architectures", } @Article{Ruan:2008:DCS, author = "Yaoping Ruan and Vivek S. Pai and Erich Nahum and John M. 
Tracey", title = "Do commodity {SMT} processors need more {OS} research?", journal = j-OPER-SYS-REV, volume = "42", number = "1", pages = "21--25", month = jan, year = "2008", CODEN = "OSRED8", DOI = "https://doi.org/10.1145/1341312.1341318", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Fri Jun 20 17:19:29 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/opersysrev.bib", abstract = "The availability of Simultaneous Multithreading (SMT) in commodity processors such as the Pentium 4 (P4) has raised interest among OS researchers. While earlier simulation studies of SMT suggested exciting performance potential, observed improvement on the P4 has been much more restrained, raising the hope that OS research can help bridge the gap. We argue that OS research for current commodity Simultaneous Multithreading (SMT) processors is unlikely to yield significant benefits. In general, we find that SMT processor simulations were optimistic about cache and memory performance characteristics, while overlooking the OS overheads of SMT kernels versus uniprocessor kernels. Using measurement and analysis on actual hardware, we find that little opportunity exists for realistic performance gains on commodity SMT beyond what is currently achieved.", acknowledgement = ack-nhfb, fjournal = "ACM SIGOPS Operating Systems Review", } @Article{Schaffer:2008:UHM, author = "Kevin Schaffer and Robert A. 
Walker", title = "Using Hardware Multithreading to Overcome Broadcast\slash Reduction Latency in an Associative {SIMD} Processor", journal = j-PARALLEL-PROCESS-LETT, volume = "18", number = "4", pages = "491--509", month = dec, year = "2008", CODEN = "PPLTEE", DOI = "https://doi.org/10.1142/S0129626408003533", ISSN = "0129-6264 (print), 1793-642X (electronic)", bibdate = "Thu Sep 2 09:08:11 MDT 2010", bibsource = "http://ejournals.wspc.com.sg/ppl/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/parallelprocesslett.bib", acknowledgement = ack-nhfb, fjournal = "Parallel Processing Letters", journal-URL = "http://www.worldscientific.com/loi/ppl", } @Article{Sen:2008:RDR, author = "Koushik Sen", title = "Race directed random testing of concurrent programs", journal = j-SIGPLAN, volume = "43", number = "6", pages = "11--21", month = jun, year = "2008", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1379022.1375584", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 11:04:53 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Bugs in multi-threaded programs often arise due to data races. Numerous static and dynamic program analysis techniques have been proposed to detect data races. We propose a novel randomized dynamic analysis technique that utilizes potential data race information obtained from an existing analysis tool to separate real races from false races without any need for manual inspection. Specifically, we use potential data race information obtained from an existing dynamic analysis technique to control a random scheduler of threads so that real race conditions get created with very high probability and those races get resolved randomly at runtime. Our approach has several advantages over existing dynamic analysis tools. 
First, we can create a real race condition and resolve the race randomly to see if an error can occur due to the race. Second, we can replay a race revealing execution efficiently by simply using the same seed for random number generation--we do not need to record the execution. Third, our approach has very low overhead compared to other precise dynamic race detection techniques because we only track all synchronization operations and a single pair of memory access statements that are reported to be in a potential race by an existing analysis. We have implemented the technique in a prototype tool for Java and have experimented on a number of large multi-threaded Java programs. We report a number of previously known and unknown bugs and real races in these Java programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "concurrency; dynamic analysis; race detection; random testing", } @Article{Sharkey:2008:RRP, author = "Joseph J. Sharkey and Jason Loew and Dmitry V. Ponomarev", title = "Reducing register pressure in {SMT} processors through {L2}-miss-driven early register release", journal = j-TACO, volume = "5", number = "3", pages = "13:1--13:??", month = nov, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1455650.1455652", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Dec 8 14:28:18 MST 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The register file is one of the most critical datapath components limiting the number of threads that can be supported on a simultaneous multithreading (SMT) processor. To allow the use of smaller register files without degrading performance, techniques that maximize the efficiency of using registers through aggressive register allocation/deallocation can be considered. 
In this article, we propose a novel technique to early deallocate physical registers allocated to threads which experience L2 cache misses. This is accomplished by speculatively committing the load-independent instructions and deallocating the registers corresponding to the previous mappings of their destinations, without waiting for the cache miss request to be serviced. The early deallocated registers are then made immediately available for allocation to instructions within the same thread as well as within other threads, thus improving the overall processor throughput. On the average across the simulated mixes of multiprogrammed SPEC 2000 workloads, our technique results in 33\% improvement in throughput and 25\% improvement in terms of harmonic mean of weighted IPCs over the baseline SMT with the state-of-the-art DCRA policy. This is achieved without creating checkpoints, maintaining per-register counters of pending consumers, performing tag rebroadcasts, register remappings, and/or additional associative searches.", acknowledgement = ack-nhfb, articleno = "13", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", keywords = "register file; Simultaneous multithreading", } @Article{Suleman:2008:FDTa, author = "M. Aater Suleman and Moinuddin K. Qureshi and Yale N. 
Patt", title = "Feedback-driven threading: power-efficient and high-performance execution of multi-threaded workloads on {CMPs}", journal = j-COMP-ARCH-NEWS, volume = "36", number = "1", pages = "277--286", month = mar, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1346281.1346317", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Extracting high-performance from the emerging Chip Multiprocessors (CMPs) requires that the application be divided into multiple threads. Each thread executes on a separate core thereby increasing concurrency and improving performance. As the number of cores on a CMP continues to increase, the performance of some multi-threaded applications will benefit from the increased number of threads, whereas, the performance of other multi-threaded applications will become limited by data-synchronization and off-chip bandwidth. For applications that get limited by data-synchronization, increasing the number of threads significantly degrades performance and increases on-chip power. Similarly, for applications that get limited by off-chip bandwidth, increasing the number of threads increases on-chip power without providing any performance improvement. Furthermore, whether an application gets limited by data-synchronization, or bandwidth, or neither depends not only on the application but also on the input set and the machine configuration. Therefore, controlling the number of threads based on the run-time behavior of the application can significantly improve performance and reduce power.\par This paper proposes Feedback-Driven Threading (FDT), a framework to dynamically control the number of threads using run-time information. 
FDT can be used to implement Synchronization-Aware Threading (SAT), which predicts the optimal number of threads depending on the amount of data-synchronization. Our evaluation shows that SAT can reduce both execution time and power by up to 66\% and 78\% respectively. Similarly, FDT can be used to implement Bandwidth-Aware Threading (BAT), which predicts the minimum number of threads required to saturate the off-chip bus. Our evaluation shows that BAT reduces on-chip power by up to 78\%. When SAT and BAT are combined, the average execution time reduces by 17\% and power reduces by 59\%. The proposed techniques leverage existing performance counters and require minimal support from the threading library.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", keywords = "bandwidth; CMP; multi-threaded; synchronization", } @Article{Suleman:2008:FDTb, author = "M. Aater Suleman and Moinuddin K. Qureshi and Yale N. Patt", title = "Feedback-driven threading: power-efficient and high-performance execution of multi-threaded workloads on {CMPs}", journal = j-OPER-SYS-REV, volume = "42", number = "2", pages = "277--286", month = mar, year = "2008", CODEN = "OSRED8", DOI = "https://doi.org/10.1145/1346281.1346317", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Fri Jun 20 17:20:12 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Extracting high-performance from the emerging Chip Multiprocessors (CMPs) requires that the application be divided into multiple threads. Each thread executes on a separate core thereby increasing concurrency and improving performance. 
As the number of cores on a CMP continues to increase, the performance of some multi-threaded applications will benefit from the increased number of threads, whereas, the performance of other multi-threaded applications will become limited by data-synchronization and off-chip bandwidth. For applications that get limited by data-synchronization, increasing the number of threads significantly degrades performance and increases on-chip power. Similarly, for applications that get limited by off-chip bandwidth, increasing the number of threads increases on-chip power without providing any performance improvement. Furthermore, whether an application gets limited by data-synchronization, or bandwidth, or neither depends not only on the application but also on the input set and the machine configuration. Therefore, controlling the number of threads based on the run-time behavior of the application can significantly improve performance and reduce power.\par This paper proposes Feedback-Driven Threading (FDT), a framework to dynamically control the number of threads using run-time information. FDT can be used to implement Synchronization-Aware Threading (SAT), which predicts the optimal number of threads depending on the amount of data-synchronization. Our evaluation shows that SAT can reduce both execution time and power by up to 66\% and 78\% respectively. Similarly, FDT can be used to implement Bandwidth-Aware Threading (BAT), which predicts the minimum number of threads required to saturate the off-chip bus. Our evaluation shows that BAT reduces on-chip power by up to 78\%. When SAT and BAT are combined, the average execution time reduces by 17\% and power reduces by 59\%. The proposed techniques leverage existing performance counters and require minimal support from the threading library.", acknowledgement = ack-nhfb, fjournal = "ACM SIGOPS Operating Systems Review", keywords = "bandwidth; CMP; multi-threaded; synchronization", } @Article{Suleman:2008:FDTc, author = "M. 
Aater Suleman and Moinuddin K. Qureshi and Yale N. Patt", title = "Feedback-driven threading: power-efficient and high-performance execution of multi-threaded workloads on {CMPs}", journal = j-SIGPLAN, volume = "43", number = "3", pages = "277--286", month = mar, year = "2008", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1346281.1346317", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 11:03:40 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Extracting high-performance from the emerging Chip Multiprocessors (CMPs) requires that the application be divided into multiple threads. Each thread executes on a separate core thereby increasing concurrency and improving performance. As the number of cores on a CMP continues to increase, the performance of some multi-threaded applications will benefit from the increased number of threads, whereas, the performance of other multi-threaded applications will become limited by data-synchronization and off-chip bandwidth. For applications that get limited by data-synchronization, increasing the number of threads significantly degrades performance and increases on-chip power. Similarly, for applications that get limited by off-chip bandwidth, increasing the number of threads increases on-chip power without providing any performance improvement. Furthermore, whether an application gets limited by data-synchronization, or bandwidth, or neither depends not only on the application but also on the input set and the machine configuration. Therefore, controlling the number of threads based on the run-time behavior of the application can significantly improve performance and reduce power.\par This paper proposes Feedback-Driven Threading (FDT), a framework to dynamically control the number of threads using run-time information. 
FDT can be used to implement Synchronization-Aware Threading (SAT), which predicts the optimal number of threads depending on the amount of data-synchronization. Our evaluation shows that SAT can reduce both execution time and power by up to 66\% and 78\% respectively. Similarly, FDT can be used to implement Bandwidth-Aware Threading (BAT), which predicts the minimum number of threads required to saturate the off-chip bus. Our evaluation shows that BAT reduces on-chip power by up to 78\%. When SAT and BAT are combined, the average execution time reduces by 17\% and power reduces by 59\%. The proposed techniques leverage existing performance counters and require minimal support from the threading library.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "bandwidth; CMP; multi-threaded; synchronization", } @Article{Thoziyoor:2008:CMM, author = "Shyamkumar Thoziyoor and Jung Ho Ahn and Matteo Monchiero and Jay B. Brockman and Norman P. Jouppi", title = "A Comprehensive Memory Modeling Tool and Its Application to the Design and Analysis of Future Memory Hierarchies", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "51--62", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1109/ISCA.2008.16", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "In this paper we introduce CACTI-D, a significant enhancement of CACTI 5.0. CACTI-D adds support for modeling of commodity DRAM technology and support for main memory DRAM chip organization. CACTI-D enables modeling of the complete memory hierarchy with consistent models all the way from SRAM based L1 caches through main memory DRAMs on DIMMs. 
We illustrate the potential applicability of CACTI-D in the design and analysis of future memory hierarchies by carrying out a last level cache study for a multicore multithreaded architecture at the 32nm technology node. In this study we use CACTI-D to model all components of the memory hierarchy including L1, L2, last level SRAM, logic process based DRAM or commodity DRAM L3 caches, and main memory DRAM chips. We carry out architectural simulation using benchmarks with large data sets and present results of their execution time, breakdown of power in the memory hierarchy, and system energy-delay product for the different system configurations. We find that commodity DRAM technology is most attractive for stacked last level caches, with significantly lower energy-delay products.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", keywords = "cache; CACTI; commodity DRAM; LLC; logic-process based DRAM; SRAM", } @Article{Vantrease:2008:CSI, author = "Dana Vantrease and Robert Schreiber and Matteo Monchiero and Moray McLaren and Norman P. Jouppi and Marco Fiorentino and Al Davis and Nathan Binkert and Raymond G. Beausoleil and Jung Ho Ahn", title = "{Corona}: System Implications of Emerging Nanophotonic Technology", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "153--164", month = jun, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1394608.1382135", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "We expect that many-core microprocessors will push performance per chip from the 10 gigaflop to the 10 teraflop range in the coming decade. To support this increased performance, memory and inter-core bandwidths will also have to scale by orders of magnitude. 
Pin limitations, the energy cost of electrical signaling, and the non-scalability of chip-length global wires are significant bandwidth impediments. Recent developments in silicon nanophotonic technology have the potential to meet these off- and on-stack bandwidth requirements at acceptable power levels. Corona is a 3D many-core architecture that uses nanophotonic communication for both inter-core communication and off-stack communication to memory or I/O devices. Its peak floating-point performance is 10 teraflops. Dense wavelength division multiplexed optically connected memory modules provide 10 terabyte per second memory bandwidth. A photonic crossbar fully interconnects its 256 low-power multithreaded cores at 20 terabyte per second bandwidth. We have simulated a 1024 thread Corona system running synthetic benchmarks and scaled versions of the SPLASH-2 benchmark suite. We believe that in comparison with an electrically-connected many-core alternative that uses the same on-stack interconnect power, Corona can provide 2 to 6 times more performance on many memory intensive workloads, while simultaneously reducing power.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", keywords = "3D stacking; many-core CMP; nanophotonics; on-chip Networks", } @TechReport{Volkov:2008:LQC, author = "Vasily Volkov and James W. Demmel", title = "{$ L U $}, {$ Q R $} and {Cholesky} Factorizations using Vector Capabilities of {GPUs}", type = "LAPACK Working Note", number = "202", institution = inst-UCB-EECS, address = inst-UCB-EECS:adr, month = may, year = "2008", bibdate = "Fri Apr 24 12:25:43 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.netlib.org/lapack/lawnspdf/lawn202.pdf", abstract = "We present performance results for dense linear algebra using the 8-series NVIDIA GPUs. 
Our matrix-matrix multiply routine (GEMM) runs 60\% faster than the vendor implementation in CUBLAS 1.1 and approaches the peak of hardware capabilities. Our LU, QR and Cholesky factorizations achieve up to 80--90\% of the peak GEMM rate. Our parallel LU running on two GPUs achieves up to $ \approx $300 Gflop/s. These results are accomplished by challenging the accepted view of the GPU architecture and programming guidelines. We argue that modern GPUs should be viewed as multithreaded multicore vector units. We exploit blocking similarly to vector computers and heterogeneity of the system by computing both on GPU and CPU. This study includes detailed benchmarking of the GPU memory system that reveals sizes and latencies of caches and TLB. We present a couple of algorithmic optimizations aimed at increasing parallelism and regularity in the problem that provide us with slightly higher performance.", acknowledgement = ack-nhfb, ucbnumber = "UCB/EECS-2008-49", } @Article{Wang:2008:PIM, author = "Kun Wang and Yu Zhang and Huayong Wang and Xiaowei Shen", title = "Parallelization of {IBM Mambo} system simulator in functional modes", journal = j-OPER-SYS-REV, volume = "42", number = "1", pages = "71--76", month = jan, year = "2008", CODEN = "OSRED8", DOI = "https://doi.org/10.1145/1341312.1341325", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Fri Jun 20 17:19:29 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Mambo [4] is IBM's full-system simulator which models PowerPC systems, and provides a complete set of simulation tools to help IBM and its partners in pre-hardware development and performance evaluation for future systems. Currently Mambo simulates target systems on a single host thread. When the number of cores increases in a target system, Mambo's simulation performance for each core goes down. 
As the so-called `multi-core era' approaches, both target and host systems will have more and more cores. It is very important for Mambo to efficiently simulate a multi-core target system on a multi-core host system. Parallelization is a natural method to speed up Mambo under this situation.\par Parallel Mambo (P-Mambo) is a multi-threaded implementation of Mambo. Mambo's simulation engine is implemented as a user-level thread-scheduler. We propose a multi-scheduler method to adapt Mambo's simulation engine to multi-threaded execution. Based on this method a core-based module partition is proposed to achieve both high inter-scheduler parallelism and low inter-scheduler dependency. Protection of shared resources is crucial to both correctness and performance of P-Mambo. Since there are two tiers of threads in P-Mambo, protecting shared resources by only OS-level locks possibly introduces deadlocks due to user-level context switch. We propose a new lock mechanism to handle this problem. Since Mambo is an on-going project with many modules currently under development, co-existence with new modules is also important to P-Mambo. We propose a global-lock-based method to guarantee compatibility of P-Mambo with future Mambo modules.\par We have implemented the first version of P-Mambo in functional modes. The performance of P-Mambo has been evaluated on the OpenMP implementation of NAS Parallel Benchmark (NPB) 3.2 [12]. 
Preliminary experimental results show that P-Mambo achieves an average speedup of 3.4 on a 4-core host machine.", acknowledgement = ack-nhfb, fjournal = "ACM SIGOPS Operating Systems Review", keywords = "architectural simulation; dynamic binary translation; parallel simulation", } @Article{Warg:2008:DTS, author = "Fredrik Warg and Per Stenstr{\"o}m", title = "Dual-thread Speculation: a Simple Approach to Uncover Thread-level Parallelism on a Simultaneous Multithreaded Processor", journal = j-INT-J-PARALLEL-PROG, volume = "36", number = "2", pages = "166--183", month = apr, year = "2008", CODEN = "IJPPE5", DOI = "https://doi.org/10.1007/s10766-007-0064-z", ISSN = "0885-7458 (print), 1573-7640 (electronic)", ISSN-L = "0885-7458", bibdate = "Wed Jul 9 16:07:03 MDT 2008", bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=36&issue=2; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=36&issue=2&spage=166", acknowledgement = ack-nhfb, fjournal = "International Journal of Parallel Programming", journal-URL = "http://link.springer.com/journal/10766", keywords = "Chip multiprocessors; Computer architecture; Simultaneous multithreading; Thread-level parallelism; Thread-level speculation", } @Book{Weaver:2008:OIO, editor = "David L. 
Weaver", title = "{OpenSPARC} Internals: {OpenSPARC T1\slash T2} Chip Multithreaded Throughput Computing", publisher = "Lulu, Inc.", address = "860 Aviation Parkway, Suite 300, Morrisville, NC 27560, USA", pages = "xviii + 369", year = "2008", ISBN = "0-557-01974-5", ISBN-13 = "978-0-557-01974-8", LCCN = "????", bibdate = "Tue Nov 11 14:49:47 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/master.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", price = "US\$20.00", URL = "http://www.opensparc.net/publications/books/opensparc-internals.html", acknowledgement = ack-nhfb, libnote = "Not yet in my library.", } @Article{Wegiel:2008:MCVa, author = "Michal Wegiel and Chandra Krintz", title = "The {Mapping Collector}: virtual memory support for generational, parallel, and concurrent compaction", journal = j-COMP-ARCH-NEWS, volume = "36", number = "1", pages = "91--102", month = mar, year = "2008", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1353535.1346294", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Parallel and concurrent garbage collectors are increasingly employed by managed runtime environments (MREs) to maintain scalability, as multi-core architectures and multi-threaded applications become pervasive. Moreover, state-of-the-art MREs commonly implement compaction to eliminate heap fragmentation and enable fast linear object allocation.\par Our empirical analysis of object demographics reveals that unreachable objects in the heap tend to form clusters large enough to be effectively managed at the granularity of virtual memory pages. 
Even though processes can manipulate the mapping of the virtual address space through the standard operating system (OS) interface on most platforms, extant parallel/concurrent compactors do not do so to exploit this clustering behavior and instead achieve compaction by performing, relatively expensive, object moving and pointer adjustment.\par We introduce the Mapping Collector (MC), which leverages virtual memory operations to reclaim and consolidate free space without moving objects and updating pointers. MC is a nearly-single-phase compactor that is simpler and more efficient than previously reported compactors that comprise two to four phases. Through effective MRE-OS coordination, MC maintains the simplicity of a non-moving collector while providing efficient parallel and concurrent compaction.\par We implement both stop-the-world and concurrent MC in a generational garbage collection framework within the open-source HotSpot Java Virtual Machine. Our experimental evaluation using a multiprocessor indicates that MC significantly increases throughput and scalability as well as reduces pause times, relative to state-of-the-art, parallel and concurrent compactors.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", keywords = "compaction; concurrent; parallel; virtual memory", } @Article{Wegiel:2008:MCVb, author = "Michal Wegiel and Chandra Krintz", title = "The {Mapping Collector}: virtual memory support for generational, parallel, and concurrent compaction", journal = j-OPER-SYS-REV, volume = "42", number = "2", pages = "91--102", month = mar, year = "2008", CODEN = "OSRED8", DOI = "https://doi.org/10.1145/1353535.1346294", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Fri Jun 20 17:20:12 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Parallel and concurrent garbage collectors are 
increasingly employed by managed runtime environments (MREs) to maintain scalability, as multi-core architectures and multi-threaded applications become pervasive. Moreover, state-of-the-art MREs commonly implement compaction to eliminate heap fragmentation and enable fast linear object allocation.\par Our empirical analysis of object demographics reveals that unreachable objects in the heap tend to form clusters large enough to be effectively managed at the granularity of virtual memory pages. Even though processes can manipulate the mapping of the virtual address space through the standard operating system (OS) interface on most platforms, extant parallel/concurrent compactors do not do so to exploit this clustering behavior and instead achieve compaction by performing, relatively expensive, object moving and pointer adjustment.\par We introduce the Mapping Collector (MC), which leverages virtual memory operations to reclaim and consolidate free space without moving objects and updating pointers. MC is a nearly-single-phase compactor that is simpler and more efficient than previously reported compactors that comprise two to four phases. Through effective MRE-OS coordination, MC maintains the simplicity of a non-moving collector while providing efficient parallel and concurrent compaction.\par We implement both stop-the-world and concurrent MC in a generational garbage collection framework within the open-source HotSpot Java Virtual Machine. 
Our experimental evaluation using a multiprocessor indicates that MC significantly increases throughput and scalability as well as reduces pause times, relative to state-of-the-art, parallel and concurrent compactors.", acknowledgement = ack-nhfb, fjournal = "ACM SIGOPS Operating Systems Review", keywords = "compaction; concurrent; parallel; virtual memory", } @Article{Wegiel:2008:MCVc, author = "Michal Wegiel and Chandra Krintz", title = "The {Mapping Collector}: virtual memory support for generational, parallel, and concurrent compaction", journal = j-SIGPLAN, volume = "43", number = "3", pages = "91--102", month = mar, year = "2008", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1353535.1346294", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 11:03:40 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Parallel and concurrent garbage collectors are increasingly employed by managed runtime environments (MREs) to maintain scalability, as multi-core architectures and multi-threaded applications become pervasive. Moreover, state-of-the-art MREs commonly implement compaction to eliminate heap fragmentation and enable fast linear object allocation.\par Our empirical analysis of object demographics reveals that unreachable objects in the heap tend to form clusters large enough to be effectively managed at the granularity of virtual memory pages. Even though processes can manipulate the mapping of the virtual address space through the standard operating system (OS) interface on most platforms, extant parallel/concurrent compactors do not do so to exploit this clustering behavior and instead achieve compaction by performing, relatively expensive, object moving and pointer adjustment.\par We introduce the Mapping Collector (MC), which leverages virtual memory operations to reclaim and consolidate free space without moving objects and updating pointers. 
MC is a nearly-single-phase compactor that is simpler and more efficient than previously reported compactors that comprise two to four phases. Through effective MRE-OS coordination, MC maintains the simplicity of a non-moving collector while providing efficient parallel and concurrent compaction.\par We implement both stop-the-world and concurrent MC in a generational garbage collection framework within the open-source HotSpot Java Virtual Machine. Our experimental evaluation using a multiprocessor indicates that MC significantly increases throughput and scalability as well as reduces pause times, relative to state-of-the-art, parallel and concurrent compactors.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "compaction; concurrent; parallel; virtual memory", } @Article{Winter:2008:ATN, author = "Jonathan A. Winter and David H. Albonesi", title = "Addressing thermal nonuniformity in {SMT} workloads", journal = j-TACO, volume = "5", number = "1", pages = "4:1--4:??", month = may, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1369396.1369400", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 16 11:41:51 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "We explore DTM techniques within the context of uniform and nonuniform SMT workloads. While DVS is suitable for addressing workloads with uniformly high temperatures, for nonuniform workloads, performance loss occurs because of the slowdown of the cooler thread. To address this, we propose and evaluate DTM mechanisms that exploit the steering-based thread management mechanisms inherent in a clustered SMT architecture. We show that in contrast to DVS, which operates globally, our techniques are more effective at controlling temperature for nonuniform workloads. 
Furthermore, we devise a DTM technique that combines steering and DVS to achieve consistently good performance across all workloads.", acknowledgement = ack-nhfb, articleno = "4", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", keywords = "adaptive microarchitectures; clustered microarchitectures; dynamic thermal management; dynamic voltage scaling; simultaneous multithreading", } @Article{Wong:2008:TAF, author = "Chee Siang Wong and Ian Tan and Rosalind Deena Kumari and Fun Wey", title = "Towards achieving fairness in the {Linux} scheduler", journal = j-OPER-SYS-REV, volume = "42", number = "5", pages = "34--43", month = jul, year = "2008", CODEN = "OSRED8", DOI = "https://doi.org/10.1145/1400097.1400102", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Wed Aug 6 16:54:12 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The Operating System scheduler is designed to allocate the CPU resources appropriately to all processes. The Linux Completely Fair Scheduler (CFS) design ensures fairness among tasks using the thread fair scheduling algorithm. This algorithm ensures allocation of resources based on the number of threads in the system and not within executing programs. This can lead to fairness issue in a multi-threaded environment as the Linux scheduler tends to favor programs with higher number of threads. We illustrate the issue of fairness through experimental evaluation thus exposing the weakness of the current allocation scheme where software developers could take advantage by spawning many additional threads in order to obtain more CPU resources. A novel algorithm is proposed as a solution towards achieving better fairness in the Linux scheduler. 
The algorithm is based on weight readjustment of the threads created in the same process to significantly reduce the unfair allocation of CPU resources in multi-threaded environments. The algorithm was implemented and evaluated. It demonstrated promising results towards solving the raised fairness issue. We conclude this paper highlighting the limitations of the proposed approach and the future work in the stated direction.", acknowledgement = ack-nhfb, fjournal = "ACM SIGOPS Operating Systems Review", keywords = "completely fair scheduler; fairness; Linux; process scheduling", } @Article{Xian:2008:CAS, author = "Feng Xian and Witawas Srisa-an and Hong Jiang", title = "Contention-aware scheduler: unlocking execution parallelism in multithreaded {Java} programs", journal = j-SIGPLAN, volume = "43", number = "10", pages = "163--180", month = sep, year = "2008", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1449955.1449778", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 22 09:57:37 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "In multithreaded programming, locks are frequently used as a mechanism for synchronization. Because today's operating systems do not consider lock usage as a scheduling criterion, scheduling decisions can be unfavorable to multithreaded applications, leading to performance issues such as convoying and heavy lock contention in systems with multiple processors. 
Previous efforts to address these issues (e.g., transactional memory, lock-free data structure) often treat scheduling decisions as 'a fact of life,' and therefore these solutions try to cope with the consequences of undesirable scheduling instead of dealing with the problem directly.\par In this paper, we introduce {\em Contention-Aware Scheduler (CA-Scheduler)}, which is designed to support efficient execution of large multithreaded Java applications in multiprocessor systems. Our proposed scheduler employs a scheduling policy that reduces lock contention. As will be shown in this paper, our prototype implementation of the CA-Scheduler in Linux and Sun HotSpot virtual machine only incurs 3.5\% runtime overhead, while the overall performance differences, when compared with a system with no contention awareness, range from a degradation of 3\% in a small multithreaded benchmark to an improvement of 15\% in a large Java application server benchmark.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "Java; operating systems; scheduling", } @Article{Ahn:2009:MDE, author = "Jung Ho Ahn and Jacob Leverich and Robert S. Schreiber and Norman P. Jouppi", title = "Multicore {DIMM}: an Energy Efficient Memory Module with Independently Controlled {DRAMs}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "8", number = "1", pages = "5--8", month = jan # "\slash " # jun, year = "2009", CODEN = "????", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Jun 20 17:18:18 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Demand for memory capacity and bandwidth keeps increasing rapidly in modern computer systems, and memory power consumption is becoming a considerable portion of the system power budget. 
However, the current DDR DIMM standard is not well suited to effectively serve CMP memory requests from both a power and performance perspective. We propose a new memory module called a Multicore DIMM, where DRAM chips are grouped into multiple virtual memory devices, each of which has its own data path and receives separate commands (address and control signals). The Multicore DIMM is designed to improve the energy efficiency of memory systems with small impact on system performance. Dividing each memory modules into 4 virtual memory devices brings a simultaneous 22\%, 7.6\%, and 18\% improvement in memory power, IPC, and system energy-delay product respectively on a set of multithreaded applications and consolidated workloads.", acknowledgement = ack-nhfb, affiliation = "Ahn, JH (Reprint Author), Hewlett Packard Labs, Mississauga, ON, Canada. Ahn, Jung Ho; Schreiber, Robert S.; Jouppi, Norman P., Hewlett Packard Labs, Mississauga, ON, Canada. Leverich, Jacob, Stanford Univ, Stanford, CA 94305 USA.", da = "2019-06-20", doc-delivery-number = "V17GC", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "DRAM; memory module; memory system; Multicore", number-of-cited-references = "16", ORCID-numbers = "Ahn, Jung Ho/0000-0003-1733-1394", research-areas = "Computer Science", researcherid-numbers = "Ahn, Jung Ho/D-1298-2013", times-cited = "26", unique-id = "Ahn:2009:MDE", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Aleen:2009:CAS, author = "Farhana Aleen and Nathan Clark", title = "Commutativity analysis for software parallelization: letting program transformations see the big picture", journal = j-SIGPLAN, volume = "44", number = "3", pages = "241--252", month = mar, year = "2009", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1508284.1508273", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jun 16 14:39:26 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Extracting performance from many-core architectures requires software engineers to create multi-threaded applications, which significantly complicates the already daunting task of software development. One solution to this problem is automatic compile-time parallelization, which can ease the burden on software developers in many situations. Clearly, automatic parallelization in its present form is not suitable for many application domains and new compiler analyses are needed address its shortcomings.\par In this paper, we present one such analysis: a new approach for detecting commutative functions. Commutative functions are sections of code that can be executed in any order without affecting the outcome of the application, e.g., inserting elements into a set. Previous research on this topic had one significant limitation, in that the results of a commutative functions must produce identical memory layouts. 
This prevented previous techniques from detecting functions like malloc, which may return different pointers depending on the order in which it is called, but these differing results do not affect the overall output of the application. Our new commutativity analysis correctly identify these situations to better facilitate automatic parallelization. We demonstrate that this analysis can automatically extract significant amounts of parallelism from many applications, and where it is ineffective it can provide software developers a useful list of functions that may be commutative provided semantic program changes that are not automatable.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "automatic software parallelization; commutative functions; random interpretation", } @Article{Amamiya:2009:CBN, author = "Satoshi Amamiya and Makoto Amamiya and Ryuzo Hasegawa and Hiroshi Fujita", title = "A continuation-based noninterruptible multithreading processor architecture", journal = j-J-SUPERCOMPUTING, volume = "47", number = "2", pages = "228--252", month = feb, year = "2009", CODEN = "JOSUED", ISSN = "0920-8542 (print), 1573-0484 (electronic)", ISSN-L = "0920-8542", bibdate = "Wed Aug 25 08:38:29 MDT 2010", bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=47&issue=2; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=47&issue=2&spage=228", acknowledgement = ack-nhfb, fjournal = "The Journal of Supercomputing", journal-URL = "http://link.springer.com/journal/11227", } @Article{Anderson:2009:LAC, author = "Zachary R. 
Anderson and David Gay and Mayur Naik", title = "Lightweight annotations for controlling sharing in concurrent data structures", journal = j-SIGPLAN, volume = "44", number = "6", pages = "98--109", month = jun, year = "2009", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1542476.1542488", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jun 16 14:41:16 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "SharC is a recently developed system for checking data-sharing in multithreaded programs. Programmers specify sharing rules (read-only, protected by a lock, etc.) for individual objects, and the SharC compiler enforces these rules using static and dynamic checks. Violations of these rules indicate unintended data sharing, which is the underlying cause of harmful data-races. Additionally, SharC allows programmers to change the sharing rules for a specific object using a {\em sharing cast}, to capture the fact that sharing rules for an object often change during the object's lifetime. SharC was successfully applied to a number of multi-threaded C programs.\par However, many programs are not readily checkable using SharC because their sharing rules, and changes to sharing rules, effectively apply to whole data structures rather than to individual objects. We have developed a system called {\em Shoal\/} to address this shortcoming. In addition to the sharing rules and sharing cast of SharC, our system includes a new concept that we call {\em groups}. A group is a collection of objects all having the same sharing mode. Each group has a distinguished member called the {\em group leader}. When the sharing mode of the group leader changes by way of a sharing cast, the sharing mode of all members of the group also changes. 
This operation is made sound by maintaining the invariant that at the point of a sharing cast, the only external pointer into the group is the pointer to the group leader. The addition of groups allows checking safe concurrency at the level of data structures rather than at the level of individual objects.\par We demonstrate the necessity and practicality of groups by applying Shoal to a wide range of concurrent C programs (the largest approaching a million lines of code). In all benchmarks groups entail low annotation burden and no significant additional performance overhead.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "concurrent programming; data races; multithreaded programming", } @Article{Antonopoulos:2009:ASH, author = "Christos D. Antonopoulos and Filip Blagojevic and Andrey N. Chernikov and Nikos P. Chrisochoides and Dimitrios S. Nikolopoulos", title = "Algorithm, software, and hardware optimizations for {Delaunay} mesh generation on simultaneous multithreaded architectures", journal = j-J-PAR-DIST-COMP, volume = "69", number = "7", pages = "601--612", month = jul, year = "2009", CODEN = "JPDCER", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Wed Sep 1 16:27:25 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sciencedirect.com/science/journal/07437315", acknowledgement = ack-nhfb, fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", } @Article{Azizi:2009:AEC, author = "Omid Azizi and Aqeel Mahesri and Sanjay J. 
Patel and Mark Horowitz", title = "Area-efficiency in {CMP} core design: co-optimization of microarchitecture and physical design", journal = j-COMP-ARCH-NEWS, volume = "37", number = "2", pages = "56--65", month = may, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1577129.1577138", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:39 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "In this paper, we examine the area-performance design space of a processing core for a chip multiprocessor (CMP), considering both the architectural design space and the tradeoffs of the physical design on which the architecture relies. We first propose a methodology for performing an integrated optimization of both the micro-architecture and the physical circuit design of a microprocessor. In our approach, we use statistical and convex fitting methods to capture a large micro-architectural design space. We then characterize the area-delay tradeoffs of the underlying circuits through RTL synthesis. Finally, we establish the relationship between the architecture and the circuits in an integrative model, which we use to optimize the processor. As a case study, we apply this methodology to explore the performance-area tradeoffs in a highly parallel accelerator architecture for visual computing applications. Based on some early circuit tradeoff data, our results indicate that two separate designs are performance/area optimal for our set of benchmarks: a simpler single-issue, 2-way multithreaded core running at high-frequency, and a more aggressively tuned dual-issue 4-way multithreaded design running at a lower frequency.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Barkstrom:2009:UAS, author = "Bruce R. 
Barkstrom", title = "On using {Ada} to solve problems in computational economics and related disciplines with concurrent, multiagent algorithms", journal = j-SIGADA-LETTERS, volume = "29", number = "3", pages = "61--72", month = dec, year = "2009", CODEN = "AALEE5", DOI = "https://doi.org/10.1145/1647420.1647437", ISSN = "1094-3641 (print), 1557-9476 (electronic)", ISSN-L = "1094-3641", bibdate = "Mon Jun 21 14:04:37 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Multiagent algorithms are widely used in computational economics and other social sciences to solve theoretical and practical problems. Because such algorithms are inherently concurrent and multithreaded, Ada's constructs for handling communications between concurrent processes and avoiding interference between them make the language very well suited to solving these problems, particularly given developments in multi-core CPU chip-making. This paper provides a concrete example of how Ada assists in solving problems in computational economics and related disciplines that work with multiagent systems. 
Solving a simple problem illustrates visualizing the agents as Ada tasks, using UML use cases and synchronization diagrams to design the communications patterns between agents, and applying protected objects and functions to avoid computational indeterminacy.", acknowledgement = ack-nhfb, fjournal = "ACM SIGAda Ada Letters", keywords = "computational and mathematical organization theory; computational economics; concurrent programming; multiagent systems; multithreaded programming", } @Article{Barnes:2009:XBA, author = "Christopher Barnes and Pranav Vaidya and Jaehwan John Lee", title = "An {XML}-Based {ADL} Framework for Automatic Generation of Multithreaded Computer Architecture Simulators", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "8", number = "1", pages = "13--16", month = jan # "\slash " # jun, year = "2009", DOI = "https://doi.org/10.1109/L-CA.2009.2", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Jun 20 17:18:18 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Computer architecture simulation has always played a pivotal role in continuous innovation of computers. However, constructing or modifying a high quality simulator is time consuming and error-prone. Thus, often Architecture Description Languages (ADLs) are used to provide an abstraction layer for describing the computer architecture and automatically generating corresponding simulators. Along the line of such research, we present a novel XML-based ADL, its compiler, and a generation methodology to automatically generate multithreaded simulators for computer architecture. We utilize the industry-standard extensible markup language XML to describe the functionality and architecture of a modeled processor. Our ADL framework allows users to easily and quickly modify the structure, register set, and execution of a modeled processor. 
To prove its validity, we have generated several multithreaded simulators with different configurations based on the MIPS five-stage processor, and successfully tested with two programs.", acknowledgement = ack-nhfb, da = "2019-06-20", doc-delivery-number = "V17GC", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "IUPUI RSFG", funding-text = "This research was funded by the IUPUI RSFG grant.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "C.0.d Modeling of computer architecture; C.1.1.b Pipeline processors", number-of-cited-references = "14", research-areas = "Computer Science", times-cited = "0", unique-id = "Barnes:2009:XBA", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Berger:2009:GSM, author = "Emery D. Berger and Ting Yang and Tongping Liu and Gene Novark", title = "{Grace}: safe multithreaded programming for {C\slash C++}", journal = j-SIGPLAN, volume = "44", number = "10", pages = "81--96", month = oct, year = "2009", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1640089.1640096", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jun 21 18:01:56 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The shift from single to multiple core architectures means that programmers must write concurrent, multithreaded programs in order to increase application performance. Unfortunately, multithreaded applications are susceptible to numerous errors, including deadlocks, race conditions, atomicity violations, and order violations. These errors are notoriously difficult for programmers to debug.\par This paper presents Grace, a software-only runtime system that eliminates concurrency errors for a class of multithreaded programs: those based on fork-join parallelism. 
By turning threads into processes, leveraging virtual memory protection, and imposing a sequential commit protocol, Grace provides programmers with the appearance of deterministic, sequential execution, while taking advantage of available processing cores to run code concurrently and efficiently. Experimental results demonstrate Grace's effectiveness: with modest code changes across a suite of computationally-intensive benchmarks (1-16 lines), Grace can achieve high scalability and performance while preventing concurrency errors.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "concurrency; determinism; deterministic concurrency; fork-join; sequential semantics", } @Article{Bocchino:2009:TES, author = "Robert L. {Bocchino, Jr.} and Vikram S. Adve and Danny Dig and Sarita V. Adve and Stephen Heumann and Rakesh Komuravelli and Jeffrey Overbey and Patrick Simmons and Hyojin Sung and Mohsen Vakilian", title = "A type and effect system for deterministic parallel {Java}", journal = j-SIGPLAN, volume = "44", number = "10", pages = "97--116", month = oct, year = "2009", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1639949.1640097", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jun 21 18:01:56 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Today's shared-memory parallel programming models are complex and error-prone. While many parallel programs are intended to be deterministic, unanticipated thread interleavings can lead to subtle bugs and nondeterministic semantics. In this paper, we demonstrate that a practical {\em type and effect system\/} can simplify parallel programming by {\em guaranteeing deterministic semantics\/} with modular, compile-time type checking even in a rich, concurrent object-oriented language such as Java. 
We describe an object-oriented type and effect system that provides several new capabilities over previous systems for expressing deterministic parallel algorithms. We also describe a language called Deterministic Parallel Java (DPJ) that incorporates the new type system features, and we show that a core subset of DPJ is sound. We describe an experimental validation showing that DPJ can express a wide range of realistic parallel programs; that the new type system features are useful for such programs; and that the parallel programs exhibit good performance gains (coming close to or beating equivalent, nondeterministic multithreaded programs where those are available).", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "commutativity; determinism; deterministic parallelism; effect systems; effects", } @Article{Bratanov:2009:VMW, author = "Stanislav Bratanov and Roman Belenov and Nikita Manovich", title = "Virtual machines: a whole new world for performance analysis", journal = j-OPER-SYS-REV, volume = "43", number = "2", pages = "46--55", month = apr, year = "2009", CODEN = "OSRED8", DOI = "https://doi.org/10.1145/1531793.1531802", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Thu Apr 23 19:43:22 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "This article addresses a problem of performance monitoring inside virtual machines (VMs). It advocates focused monitoring of particular virtualized programs, explains the need for and the importance of such an approach to performance monitoring in virtualized execution environments, and emphasizes its benefits for virtual machine manufacturers, virtual machine users (mostly, software developers) and hardware (processor) manufacturers. 
The article defines the problem of in-VM performance monitoring as the ability to employ modern methods and hardware performance monitoring capabilities inside virtual machines to an extent comparable with what is being done in real environments. Unfortunately, there are numerous reasons preventing us from achieving such an ambitious goal, one of those reasons being the lack of support from virtualization engines; that is why a novel method of 'cooperative' performance data collection is disclosed. The method implies collection of performance data at physical hardware and simultaneous tracking of software states inside a virtual machine. Each statistically visible execution point of the virtualized software may then be associated with information on real hardware events. The method effectively enables time-based sampling of virtualized workloads combined with hardware event counting, is applicable to unmodified, commercially available virtual machines, and has competitive precision and overhead. 
The practical significance and value of the method are further illustrated by studying a parallel workload and uncovering virtualization-specific performance issues of multithreaded programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGOPS Operating Systems Review", keywords = "hardware performance event counters; virtual machines", } @Article{Choi:2009:HCS, author = "Seungryul Choi and Donald Yeung", title = "Hill-climbing {SMT} processor resource distribution", journal = j-TOCS, volume = "27", number = "1", pages = "1:1--1:??", month = feb, year = "2009", CODEN = "ACSYEC", ISSN = "0734-2071 (print), 1557-7333 (electronic)", ISSN-L = "0734-2071", bibdate = "Fri Feb 13 18:30:25 MST 2009", bibsource = "http://www.acm.org/pubs/contents/journals/tocs/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/tocs.bib", abstract = "The key to high performance in Simultaneous MultiThreaded (SMT) processors lies in optimizing the distribution of shared resources to active threads. Existing resource distribution techniques optimize performance only indirectly. They infer potential performance bottlenecks by observing indicators, like instruction occupancy or cache miss counts, and take actions to try to alleviate them. While the corrective actions are designed to improve performance, their actual performance impact is not known since end performance is never monitored. Consequently, potential performance gains are lost whenever the corrective actions do not effectively address the actual bottlenecks occurring in the pipeline.\par We propose a different approach to SMT resource distribution that optimizes end performance directly. Our approach observes the impact that resource distribution decisions have on performance at runtime, and feeds this information back to the resource distribution mechanisms to improve future decisions. 
By evaluating many different resource distributions, our approach tries to learn the best distribution over time. Because we perform learning online, learning time is crucial. We develop a hill-climbing algorithm that quickly learns the best distribution of resources by following the performance gradient within the resource distribution space. We also develop several ideal learning algorithms to enable deeper insights through limit studies.\par This article conducts an in-depth investigation of hill-climbing SMT resource distribution using a comprehensive suite of 63 multiprogrammed workloads. Our results show hill-climbing outperforms ICOUNT, FLUSH, and DCRA (three existing SMT techniques) by 11.4\%, 11.5\%, and 2.8\%, respectively, under the weighted IPC metric. A limit study conducted using our ideal learning algorithms shows our approach can potentially outperform the same techniques by 19.2\%, 18.0\%, and 7.6\%, respectively, thus demonstrating additional room exists for further improvement. Using our ideal algorithms, we also identify three bottlenecks that limit online learning speed: local maxima, phased behavior, and interepoch jitter. We define metrics to quantify these learning bottlenecks, and characterize the extent to which they occur in our workloads. Finally, we conduct a sensitivity study, and investigate several extensions to improve our hill-climbing technique.", acknowledgement = ack-nhfb, articleno = "1", fjournal = "ACM Transactions on Computer Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J774", } @Book{Cormen:2009:IA, author = "Thomas H. Cormen and Charles Eric Leiserson and Ronald L.
Rivest and Clifford Stein", title = "Introduction to algorithms", publisher = pub-MIT, address = pub-MIT:adr, edition = "Third", pages = "xix + 1292", year = "2009", ISBN = "0-262-03384-4 (hardcover), 0-262-53305-7 (paperback)", ISBN-13 = "978-0-262-03384-8 (hardcover), 978-0-262-53305-8 (paperback)", LCCN = "QA76.6 .C662 2009", bibdate = "Thu Sep 9 14:42:33 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; z3950.loc.gov:7090/Voyager", abstract = "Some books on algorithms are rigorous but incomplete; others cover masses of material but lack rigor. Introduction to Algorithms uniquely combines rigor and comprehensiveness. The book covers a broad range of algorithms in depth, yet makes their design and analysis accessible to all levels of readers. Each chapter is relatively self-contained and can be used as a unit of study. The algorithms are described in English and in a pseudocode designed to be readable by anyone who has done a little programming. The explanations have been kept elementary without sacrificing depth of coverage or mathematical rigor. The first edition became a widely used text in universities worldwide as well as the standard reference for professionals. The second edition featured new chapters on the role of algorithms, probabilistic analysis and randomized algorithms, and linear programming. The third edition has been revised and updated throughout. It includes two completely new chapters, on van Emde Boas trees and multithreaded algorithms, and substantial additions to the chapter on recurrences (now called ``Divide-and-Conquer''). It features improved treatment of dynamic programming and greedy algorithms and a new notion of edge-based flow in the material on flow networks. 
Many new exercises and problems have been added for this edition.", acknowledgement = ack-nhfb, libnote = "Not in my library.", subject = "Computer programming; Computer algorithms", } @Article{Daniluk:2009:MTS, author = "Andrzej Daniluk", title = "Multithreaded transactions in scientific computing. {The} {Growth06\_v2} program", journal = j-COMP-PHYS-COMM, volume = "180", number = "7", pages = "1219--1220", month = jul, year = "2009", CODEN = "CPHCBZ", DOI = "https://doi.org/10.1016/j.cpc.2009.01.024", ISSN = "0010-4655 (print), 1879-2944 (electronic)", ISSN-L = "0010-4655", bibdate = "Mon Feb 13 23:42:43 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/compphyscomm2000.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.sciencedirect.com/science/article/pii/S0010465509000393", acknowledgement = ack-nhfb, fjournal = "Computer Physics Communications", journal-URL = "http://www.sciencedirect.com/science/journal/00104655", } @Article{deBoer:2009:SVC, author = "F. S. 
de Boer", title = "A shared-variable concurrency analysis of multi-threaded object-oriented programs", journal = j-THEOR-COMP-SCI, volume = "410", number = "2--3", pages = "128--141", day = "6", month = feb, year = "2009", CODEN = "TCSCDI", ISSN = "0304-3975 (print), 1879-2294 (electronic)", ISSN-L = "0304-3975", bibdate = "Mon Mar 28 21:21:46 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sciencedirect.com/science/journal/03043975", acknowledgement = ack-nhfb, fjournal = "Theoretical Computer Science", journal-URL = "http://www.sciencedirect.com/science/journal/03043975", } @Article{Desai:2009:AIC, author = "Aniruddha Desai and Jugdutt Singh", title = "Architecture Independent Characterization of Embedded {Java} Workloads", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "8", number = "1", pages = "29--32", month = jan # "\slash " # jun, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2009.7", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/java2000.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "This paper presents architecture independent characterization of embedded Java workloads based on the industry standard GrinderBench benchmark which includes different classes of real world embedded Java applications. This work is based on a custom built embedded Java Virtual Machine (JVM) simulator specifically designed for embedded JVM modeling and embodies domain specific details such as thread scheduling, algorithms used for native CLDC APIs and runtime data structures optimized for use in embedded systems. 
The results presented include dynamic execution characteristics, dynamic bytecode instruction mix, application and API workload distribution, Object allocation statistics, instruction-set coverage, memory usage statistics and method code and stack frame characteristics.", acknowledgement = ack-nhfb, affiliation = "Desai, A (Reprint Author), La Trobe Univ, Bundoora, Vic 3086, Australia. Desai, Aniruddha; Singh, Jugdutt, La Trobe Univ, Bundoora, Vic 3086, Australia.", author-email = "desai@ieee.org", da = "2019-06-20", doc-delivery-number = "V17GC", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Algorithm design and analysis; application program interfaces; architecture independent characterization; CLDC API; custom built embedded Java virtual machine simulator; data structures; Data structures; Design optimization; dynamic bytecode instruction mix; dynamic execution characteristics; embedded Java workload; Embedded Systems; embedded systems; Embedded Systems; industry standard GrinderBench benchmark; instruction sets; instruction-set coverage; Java; Java bytecode; Job shop scheduling; JVM; memory usage statistics; method code characteristics; multi-threading; object allocation statistics; Runtime; runtime data structure; scheduling; Scheduling algorithm; stack frame characteristics; Statistical distributions; storage allocation; thread scheduling; virtual machines; Virtual machining; Workload Characterization", number-of-cited-references = "8", research-areas = "Computer Science", times-cited = "0", unique-id = "Desai:2009:AIC", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Devietti:2009:DDS, author = "Joseph Devietti and Brandon Lucia and Luis Ceze and Mark Oskin", title = "{DMP}: deterministic shared memory multiprocessing", journal = j-SIGPLAN, volume = "44", number = "3", pages = "85--96", 
month = mar, year = "2009", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1508244.1508255", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jun 16 14:39:26 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Current shared memory multicore and multiprocessor systems are nondeterministic. Each time these systems execute a multithreaded application, even if supplied with the same input, they can produce a different output. This frustrates debugging and limits the ability to properly test multithreaded code, becoming a major stumbling block to the much-needed widespread adoption of parallel programming.\par In this paper we make the case for fully deterministic shared memory multiprocessing (DMP). The behavior of an arbitrary multithreaded program on a DMP system is only a function of its inputs. The core idea is to make inter-thread communication fully deterministic. Previous approaches to coping with nondeterminism in multithreaded programs have focused on replay, a technique useful only for debugging. In contrast, while DMP systems are directly useful for debugging by offering repeatability by default, we argue that parallel programs should execute deterministically in the field as well. This has the potential to make testing more assuring and increase the reliability of deployed multithreaded software. We propose a range of approaches to enforcing determinism and discuss their implementation trade-offs. 
We show that determinism can be provided with little performance cost using our architecture proposals on future hardware, and that software-only approaches can be utilized on existing systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "debugging; determinism; multicores; parallel programming", } @Article{Eyerman:2009:MLP, author = "Stijn Eyerman and Lieven Eeckhout", title = "Memory-level parallelism aware fetch policies for simultaneous multithreading processors", journal = j-TACO, volume = "6", number = "1", pages = "3:1--3:??", month = mar, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1509864.1509867", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu May 7 14:55:25 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "A thread executing on a simultaneous multithreading (SMT) processor that experiences a long-latency load will eventually stall while holding execution resources. Existing long-latency load aware SMT fetch policies limit the amount of resources allocated by a stalled thread by identifying long-latency loads and preventing the thread from fetching more instructions --- and in some implementations, instructions beyond the long-latency load are flushed to release allocated resources.\par This article proposes an SMT fetch policy that takes into account the available memory-level parallelism (MLP) in a thread. The key idea proposed in this article is that in case of an isolated long-latency load (i.e., there is no MLP), the thread should be prevented from allocating additional resources. However, in case multiple independent long-latency loads overlap (i.e., there is MLP), the thread should allocate as many resources as needed in order to fully expose the available MLP. 
MLP-aware fetch policies achieve better performance for MLP-intensive threads on SMT processors, leading to higher overall system throughput and shorter average turnaround time than previously proposed fetch policies.", acknowledgement = ack-nhfb, articleno = "3", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", keywords = "Fetch Policy; Memory-Level Parallelism (MLP); Simultaneous Multithreading (SMT)", } @Article{Eyerman:2009:PTC, author = "Stijn Eyerman and Lieven Eeckhout", title = "Per-thread cycle accounting in {SMT} processors", journal = j-SIGPLAN, volume = "44", number = "3", pages = "133--144", month = mar, year = "2009", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1508284.1508260", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jun 16 14:39:26 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "This paper proposes a cycle accounting architecture for Simultaneous Multithreading (SMT) processors that estimates the execution times for each of the threads had they been executed alone, while they are running simultaneously on the SMT processor. This is done by accounting each cycle to either a base, miss event or waiting cycle component during multi-threaded execution. Single-threaded alone execution time is then estimated as the sum of the base and miss event components; the waiting cycle component represents the lost cycle count due to SMT execution. The cycle accounting architecture incurs reasonable hardware cost (around 1KB of storage) and estimates single-threaded performance with average prediction errors around 7.2\% for two-program workloads and 11.7\% for four-program workloads.\par The cycle accounting architecture has several important applications to system software and its interaction with SMT hardware. 
For one, the estimated single-thread alone execution time provides an accurate picture to system software of the actually consumed processor cycles per thread. The alone execution time instead of the total execution time (timeslice) may make system software scheduling policies more effective. Second, a new class of thread-progress aware SMT fetch policies based on per-thread progress indicators enable system software level priorities to be enforced at the hardware level.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "cycle accounting; simultaneous multithreading (SMT); thread-progress aware fetch policy", } @Article{Flanagan:2009:FEP, author = "Cormac Flanagan and Stephen N. Freund", title = "{FastTrack}: efficient and precise dynamic race detection", journal = j-SIGPLAN, volume = "44", number = "6", pages = "121--133", month = jun, year = "2009", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1542476.1542490", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jun 16 14:41:16 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2000.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2000.bib", abstract = "Multithreaded programs are notoriously prone to race conditions. Prior work on dynamic race detectors includes fast but imprecise race detectors that report false alarms, as well as slow but precise race detectors that never report false alarms. The latter typically use expensive vector clock operations that require time linear in the number of program threads.\par This paper exploits the insight that the full generality of vector clocks is unnecessary in most cases. 
That is, we can replace heavyweight vector clocks with an adaptive lightweight representation that, for almost all operations of the target program, requires only constant space and supports constant-time operations. This representation change significantly improves time and space performance, with no loss in precision.\par Experimental results on Java benchmarks including the Eclipse development environment show that our FastTrack race detector is an order of magnitude faster than a traditional vector-clock race detector, and roughly twice as fast as the high-performance DJIT+ algorithm. FastTrack is even comparable in speed to Eraser on our Java benchmarks, while never reporting false alarms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "concurrency; dynamic analysis; race conditions", } @Article{Fung:2009:DWF, author = "Wilson W. L. Fung and Ivan Sham and George Yuan and Tor M. Aamodt", title = "Dynamic warp formation: {Efficient MIMD} control flow on {SIMD} graphics hardware", journal = j-TACO, volume = "6", number = "2", pages = "7:1--7:??", month = jun, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1543753.1543756", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jul 2 12:32:04 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Recent advances in graphics processing units (GPUs) have resulted in massively parallel hardware that is easily programmable and widely available in today's desktop and notebook computer systems. GPUs typically use single-instruction, multiple-data (SIMD) pipelines to achieve high performance with minimal overhead for control hardware. Scalar threads running the same computing kernel are grouped together into SIMD batches, sometimes referred to as warps. 
While SIMD is ideally suited for simple programs, recent GPUs include control flow instructions in the GPU instruction set architecture and programs using these instructions may experience reduced performance due to the way branch execution is supported in hardware. One solution is to add a stack to allow different SIMD processing elements to execute distinct program paths after a branch instruction. The occurrence of diverging branch outcomes for different processing elements significantly degrades performance using this approach. In this article, we propose dynamic warp formation and scheduling, a mechanism for more efficient SIMD branch execution on GPUs. It dynamically regroups threads into new warps on the fly following the occurrence of diverging branch outcomes. We show that a realistic hardware implementation of this mechanism improves performance by 13\%, on average, with 256 threads per core, 24\% with 512 threads, and 47\% with 768 threads for an estimated area increase of 8\%.", acknowledgement = ack-nhfb, articleno = "7", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", keywords = "control flow; fine-grained multithreading; GPU; SIMD", } @Article{Gabor:2009:SLA, author = "Ron Gabor and Avi Mendelson and Shlomo Weiss", title = "Service level agreement for multithreaded processors", journal = j-TACO, volume = "6", number = "2", pages = "6:1--6:??", month = jun, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1543753.1543755", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jul 2 12:32:04 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Multithreading is widely used to increase processor throughput. As the number of shared resources increase, managing them while guaranteeing predicted performance becomes a major problem. 
Attempts have been made in previous work to ease this via different fairness mechanisms. In this article, we present a new approach to control the resource allocation and sharing via a service level agreement (SLA)-based mechanism; that is, via an agreement in which multithreaded processors guarantee a minimal level of service to the running threads. We introduce a new metric, {\em C\/}$_{SLA}$, for conformance to SLA in multithreaded processors and show that controlling resources using SLA allows for higher gains than are achievable by previously suggested fairness techniques. It also permits improving one metric (e.g., power) while maintaining SLA in another (e.g., performance). We compare SLA enforcement to schemes based on other fairness metrics, which are mostly targeted at equalizing execution parameters. We show that using SLA rather than fairness based algorithms provides a range of acceptable execution points from which we can select the point that best fits our optimization target, such as maximizing the weighted speedup (sum of the speedups of the individual threads) or reducing power. We demonstrate the effectiveness of the new SLA approach using switch-on-event (coarse-grained) multithreading. Our weighted speedup improvement scheme successfully enforces SLA while improving the weighted speedup by an average of 10\% for unbalanced threads. This result is significant when compared with performance losses that may be incurred by fairness enforcement methods. When optimizing for power reduction in unbalanced threads SLA enforcement reduces the power by an average of 15\%. SLA may be complemented by other power reduction methods to achieve further power savings {\em and\/} maintain the same service level for the threads.
We also demonstrate differentiated SLA, where weighted speedup is maximized while each thread may have a different throughput constraint.", acknowledgement = ack-nhfb, articleno = "6", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", keywords = "fairness; performance; power; Service level agreement; throughput", } @Article{Ganty:2009:VLA, author = "Pierre Ganty and Rupak Majumdar and Andrey Rybalchenko", title = "Verifying liveness for asynchronous programs", journal = j-SIGPLAN, volume = "44", number = "1", pages = "102--113", month = jan, year = "2009", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1594834.1480895", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 9 08:40:38 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Asynchronous or 'event-driven' programming is a popular technique to efficiently and flexibly manage concurrent interactions. In these programs, the programmer can post tasks that get stored in a task buffer and get executed atomically by a non-preemptive scheduler at a future point. We give a decision procedure for the fair termination property of asynchronous programs. The fair termination problem asks, given an asynchronous program and a fairness condition on its executions, does the program always terminate on fair executions? The fairness assumptions rule out certain undesired bad behaviors, such as where the scheduler ignores a set of posted tasks forever, or where a non-deterministic branch is always chosen in one direction. Since every liveness property reduces to a fair termination property, our decision procedure extends to liveness properties of asynchronous programs. Our decision procedure for the fair termination of asynchronous programs assumes all variables are finite-state. 
Even though variables are finite-state, asynchronous programs can have an unbounded stack from recursive calls made by tasks, as well as an unbounded task buffer of pending tasks. We show a reduction from the fair termination problem for asynchronous programs to fair termination problems on Petri Nets, and our main technical result is a reduction of the latter problem to Presburger satisfiability. Our decidability result is in contrast to multithreaded recursive programs, for which liveness properties are undecidable. While we focus on fair termination, we show our reduction to Petri Nets can be used to prove related properties such as fair nonstarvation (every posted task is eventually executed) and safety properties such as boundedness (find a bound on the maximum number of posted tasks that can be in the task buffer at any point).", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "asynchronous (event-driven) programming; fair termination; liveness; Petri nets", } @TechReport{Granat:2009:NPQ, author = "Robert Granat and Bo K{\aa}gstr{\"o}m and Daniel Kressner", title = "A novel parallel {$ Q R $} algorithm for hybrid distributed memory {HPC} systems", type = "LAPACK Working Note", number = "216", institution = "Department of Computing Science and HPC2N", address = "Ume{\aa} University, S-901 Ume{\aa}, Sweden", month = apr, year = "2009", bibdate = "Fri Apr 24 12:25:43 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.netlib.org/lapack/lawnspdf/lawn216.pdf", abstract = "A novel variant of the parallel QR algorithm for solving dense nonsymmetric eigenvalue problems on hybrid distributed high performance computing (HPC) systems is presented. For this purpose, we introduce the concept of multi-window bulge chain chasing and parallelize aggressive early deflation. 
The multi-window approach ensures that most computations when chasing chains of bulges are performed in level 3 BLAS operations, while the aim of aggressive early deflation is to speed up the convergence of the QR algorithm. Mixed MPI-OpenMP coding techniques are utilized for porting the codes to distributed memory platforms with multithreaded nodes, such as multicore processors. Numerous numerical experiments confirm the superior performance of our parallel QR algorithm in comparison with the existing ScaLAPACK code, leading to an implementation that is one to two orders of magnitude faster for sufficiently large problems, including a number of examples from applications.", acknowledgement = ack-nhfb, keywords = "aggressive early deflation; bulge chasing; Eigenvalue problem; hybrid distributed memory systems.; level 3 performance; multishift; nonsymmetric QR algorithm; parallel algorithms; parallel computations", utknumber = "UMINF-09.06", } @Article{Grant:2009:IEE, author = "Ryan E. Grant and Ahmad Afsahi", title = "Improving energy efficiency of asymmetric chip multithreaded multiprocessors through reduced {OS} noise scheduling", journal = j-CCPE, volume = "21", number = "18", pages = "2355--2376", day = "25", month = dec, year = "2009", CODEN = "CCPEBO", DOI = "https://doi.org/10.1002/cpe.1454", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Mon Dec 5 10:08:40 MST 2011", bibsource = "http://www.interscience.wiley.com/jpages/1532-0626; https://www.math.utah.edu/pub/tex/bib/ccpe.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Concurrency and Computation: Prac\-tice and Experience", journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626", onlinedate = "8 Jul 2009", } @Article{Guz:2009:MCV, author = "Zvika Guz and Evgeny Bolotin and Idit Keidar and Avinoam Kolodny and Avi Mendelson and Uri C. Weiser", title = "Many-Core vs. 
Many-Thread Machines: Stay Away From the Valley", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "8", number = "1", pages = "25--28", month = jan # "\slash " # jun, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2009.4", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "We study the tradeoffs between Many-Core machines like Intel's Larrabee and Many-Thread machines like Nvidia and AMD GPGPUs. We define a unified model describing a superposition of the two architectures, and use it to identify operation zones for which each machine is more suitable. Moreover, we identify an intermediate zone in which both machines deliver inferior performance. We study the shape of this ``performance valley'' and provide insights on how it can be avoided.", acknowledgement = ack-nhfb, affiliation = "Guz, Z (Reprint Author), Technion Israel Inst Technol, EE Dept, IL-32000 Haifa, Israel. Guz, Zvika; Keidar, Idit; Kolodny, Avinoam; Weiser, Uri C., Technion Israel Inst Technol, EE Dept, IL-32000 Haifa, Israel. Bolotin, Evgeny, Intel Corp, Santa Clara, CA 95051 USA. Mendelson, Avi, Microsoft Corp, Redmond, WA 98052 USA.", author-email = "zguz@tx.technion.ac.il evgeny.bolotin@intel.com idish@ee.technion.ac.il kolodny@ee.technion.ac.il avim@microsoft.com uri.weiser@ee.technion.ac.il", da = "2019-06-20", doc-delivery-number = "V17GC", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Semiconductors Research Corporation (SRC); Intel; Israeli Ministry of Science Knowledge Center on Chip MultiProcessors", funding-text = "We thank Ronny Ronen, Michael Behar, and Roni Rosner. 
This work was partially supported by Semiconductors Research Corporation (SRC), Intel, and the Israeli Ministry of Science Knowledge Center on Chip MultiProcessors.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "AMD GPGPU; architecture superposition; Bandwidth; Chip Multiprocessors; Computer Systems; coprocessors; Delay; Engines; Equations; GPGPU; Graphics; Intel's Larrabee; many-core machines; many-thread machines; Multi-core/single-chip multiprocessors; multi-threading; multiprocessing systems; Nvidia GPGPU; Parallel Architectures; parallel architectures; Parallel processing; performance valley; Processor Architectures; Shape", number-of-cited-references = "9", research-areas = "Computer Science", times-cited = "27", unique-id = "Guz:2009:MCV", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Hoffman:2009:SAT, author = "Kevin J. Hoffman and Patrick Eugster and Suresh Jagannathan", title = "Semantics-aware trace analysis", journal = j-SIGPLAN, volume = "44", number = "6", pages = "453--464", month = jun, year = "2009", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1542476.1542527", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jun 16 14:41:16 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "As computer systems continue to become more powerful and complex, so do programs. High-level abstractions introduced to deal with complexity in large programs, while simplifying human reasoning, can often obfuscate salient program properties gleaned from automated source-level analysis through subtle (often non-local) interactions. Consequently, understanding the effects of program changes and whether these changes violate intended protocols become difficult to infer.
Refactorings, and feature additions, modifications, or removals can introduce hard-to-catch bugs that often go undetected until many revisions later.\par To address these issues, this paper presents a novel dynamic program analysis that builds a {\em semantic view\/} of program executions. These views reflect program abstractions and aspects; however, views are not simply projections of execution traces, but are linked to each other to capture semantic interactions among abstractions at different levels of granularity in a scalable manner.\par We describe our approach in the context of Java and demonstrate its utility to improve {\em regression analysis}. We first formalize a subset of Java and a grammar for traces generated at program execution. We then introduce several types of views used to analyze regression bugs along with a novel, scalable technique for semantic differencing of traces from different versions of the same program. Benchmark results on large open-source Java programs demonstrate that semantic-aware trace differencing can identify precise and useful details about the underlying cause for a regression, even in programs that use reflection, multithreading, or dynamic code generation, features that typically confound other analysis techniques.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "automated debugging; debugging aids; semantic tracing; testing tools; trace views; tracing", } @Article{Joshi:2009:RDP, author = "Pallavi Joshi and Chang-Seo Park and Koushik Sen and Mayur Naik", title = "A randomized dynamic program analysis technique for detecting real deadlocks", journal = j-SIGPLAN, volume = "44", number = "6", pages = "110--120", month = jun, year = "2009", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1543135.1542489", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jun 16 14:41:16 MDT 
2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "We present a novel dynamic analysis technique that finds real deadlocks in multi-threaded programs. Our technique runs in two stages. In the first stage, we use an imprecise dynamic analysis technique to find potential deadlocks in a multi-threaded program by observing an execution of the program. In the second stage, we control a random thread scheduler to create the potential deadlocks with high probability. Unlike other dynamic analysis techniques, our approach has the advantage that it does not give any false warnings. We have implemented the technique in a prototype tool for Java, and have experimented on a number of large multi-threaded Java programs. We report a number of previously known and unknown real deadlocks that were found in these benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "active testing; concurrency; deadlock detection; dynamic program analysis", } @Article{Kejariwal:2009:ELL, author = "Arun Kejariwal and Alexander V. Veidenbaum and Alexandru Nicolau and Milind Girkar and Xinmin Tian and Hideki Saito", title = "On the exploitation of loop-level parallelism in embedded applications", journal = j-TECS, volume = "8", number = "2", pages = "10:1--10:??", month = jan, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1457255.1457257", ISSN = "1539-9087 (print), 1558-3465 (electronic)", ISSN-L = "1539-9087", bibdate = "Thu Feb 5 19:15:05 MST 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Advances in the silicon technology have enabled increasing support for hardware parallelism in embedded processors. Vector units, multiple processors/cores, multithreading, special-purpose accelerators such as DSPs or cryptographic engines, or a combination of the above have appeared in a number of processors. 
They serve to address the increasing performance requirements of modern embedded applications. To what extent the available hardware parallelism can be exploited is directly dependent on the amount of parallelism inherent in the given application and the congruence between the granularity of hardware and application parallelism. This paper discusses how loop-level parallelism in embedded applications can be exploited in hardware and software. Specifically, it evaluates the efficacy of automatic loop parallelization and the performance potential of different types of parallelism, viz., true thread-level parallelism (TLP), speculative thread-level parallelism and vector parallelism, when executing loops. Additionally, it discusses the interaction between parallelization and vectorization. Applications from both the industry-standard EEMBC{\reg},$^1$ 1.1, EEMBC 2.0 and the academic MiBench embedded benchmark suites are analyzed using the Intel{\reg}$^2$ C compiler. The results show the performance that can be achieved today on real hardware and using a production compiler, provide upper bounds on the performance potential of the different types of thread-level parallelism, and point out a number of issues that need to be addressed to improve performance. The latter include parallelization of libraries such as libc and design of parallel algorithms to allow maximal exploitation of parallelism. 
The results also point to the need for developing new benchmark suites more suitable to parallel compilation and execution.\par $^1$ Other names and brands may be claimed as the property of others.\par $^2$ Intel is a trademark of Intel Corporation or its subsidiaries in the United States and other countries.", acknowledgement = ack-nhfb, articleno = "10", fjournal = "ACM Transactions on Embedded Computing Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J840", keywords = "libraries; Multi-cores; multithreading; parallel loops; programming models; system-on-chip (Soc); thread-level speculation; vectorization", } @Article{Kejariwal:2009:PSA, author = "Arun Kejariwal and Calin Cas{\c{c}}aval", title = "Parallelization spectroscopy: analysis of thread-level parallelism in {HPC} programs", journal = j-SIGPLAN, volume = "44", number = "4", pages = "293--294", month = apr, year = "2009", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1594835.1504221", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 9 08:40:49 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "In this paper, we present a method --- parallelization spectroscopy --- for analyzing the thread-level parallelism available in production High Performance Computing (HPC) codes. We survey a number of techniques that are commonly used for parallelization and classify all the loops in the case study presented using a sensitivity metric: how likely is a particular technique is successful in parallelizing the loop.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "loop transformations; multithreading; parallelism", } @Article{Kunal:2009:HDS, author = "K. Kunal and K. George and M. Gautam and V. 
Kamakoti", title = "{HTM} design spaces: complete decoupling from caches and achieving highly concurrent transactions", journal = j-OPER-SYS-REV, volume = "43", number = "2", pages = "98--99", month = apr, year = "2009", CODEN = "OSRED8", DOI = "https://doi.org/10.1145/1531793.1531809", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Thu Apr 23 19:43:22 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "This paper proposes a Hardware Transactional Memory (HTM) design for multi-core environments. Using a novel technique to keep track of transactional read-write entries, the design provides a holistic and scalable solution to Transactional Memory (TM) implementation issues of context switching, process migration and overflow handling. Another aspect of the design is that it allows transactions to run in a highly concurrent manner by using special techniques to handle conflict resolution, conflict detection and overflows. The feasibility and validity of the proposed design are demonstrated by developing a synthesizable Hardware Description Language (HDL) model of the design and also experimenting on the same with standard benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGOPS Operating Systems Review", keywords = "context switching; hardware transactional memory; multi-threaded cores; operating systems; overflow handling; process migration", } @TechReport{Kurzak:2009:SLA, author = "Jakub Kurzak and Hatem Ltaief and Jack Dongarra and Rosa M. 
Badia", title = "Scheduling Linear Algebra Operations on Multicore Processors", type = "LAPACK Working Note", number = "213", institution = inst-UT-CS, address = inst-UT-CS:adr, month = feb, year = "2009", bibdate = "Fri Apr 24 12:25:43 2009", bibsource = "https://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.netlib.org/lapack/lawnspdf/lawn213.pdf", remark = "NOTE(review): the abstract below appears to belong to a different work (Volkov and Demmel, ``Benchmarking GPUs to tune dense linear algebra'', LAWN 202), not to the LAWN 213 scheduling paper named in the title; verify against lawn213.pdf.", abstract = "We present performance results for dense linear algebra using the 8-series NVIDIA GPUs. Our matrix-matrix multiply routine (GEMM) runs 60\% faster than the vendor implementation in CUBLAS 1.1 and approaches the peak of hardware capabilities. Our LU, QR and Cholesky factorizations achieve up to 80--90\% of the peak GEMM rate. Our parallel LU running on two GPUs achieves up to $ \approx $300 Gflop/s. These results are accomplished by challenging the accepted view of the GPU architecture and programming guidelines. We argue that modern GPUs should be viewed as multithreaded multicore vector units. We exploit blocking similarly to vector computers and heterogeneity of the system by computing both on GPU and CPU. This study includes detailed benchmarking of the GPU memory system that reveals sizes and latencies of caches and TLB.
We present a couple of algorithmic optimizations aimed at increasing parallelism and regularity in the problem that provide us with slightly higher performance.", acknowledgement = ack-nhfb, keywords = "Cholesky; factorization; linear algebra; LU; multicore; QR; scheduling; task graph", utknumber = "UT-CS-09-636", } @Article{Lee:2009:MHF, author = "Taehee Lee and Tobias H{\"o}llerer", title = "Multithreaded Hybrid Feature Tracking for Markerless Augmented Reality", journal = j-IEEE-TRANS-VIS-COMPUT-GRAPH, volume = "15", number = "3", pages = "355--368", month = may # "\slash " # jun, year = "2009", CODEN = "ITVGEA", DOI = "https://doi.org/10.1109/TVCG.2008.190", ISSN = "1077-2626 (print), 1941-0506 (electronic), 2160-9306", ISSN-L = "1077-2626", bibdate = "Thu Jul 2 10:22:33 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetransviscomputgraph.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Visualization and Computer Graphics", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2945", } @Article{Lenharth:2009:RDO, author = "Andrew Lenharth and Vikram S. Adve and Samuel T. King", title = "Recovery domains: an organizing principle for recoverable operating systems", journal = j-SIGPLAN, volume = "44", number = "3", pages = "49--60", month = mar, year = "2009", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1508284.1508251", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jun 16 14:39:26 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "We describe a strategy for enabling existing commodity operating systems to recover from unexpected run-time errors in nearly any part of the kernel, including core kernel components. 
Our approach is dynamic and request-oriented; it isolates the effects of a fault to the requests that caused the fault rather than to static kernel components. This approach is based on a notion of 'recovery domains,' an organizing principle to enable rollback of state affected by a request in a multithreaded system with minimal impact on other requests or threads. We have applied this approach on v2.4.22 and v2.6.27 of the Linux kernel and it required 132 lines of changed or new code: the other changes are all performed by a simple instrumentation pass of a compiler. Our experiments show that the approach is able to recover from otherwise fatal faults with minimal collateral impact during a recovery event.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "akeso; automatic fault recovery; recovery domains", } @Article{Lubbers:2009:RMP, author = "Enno L{\"u}bbers and Marco Platzner", title = "{ReconOS}: {Multithreaded} programming for reconfigurable computers", journal = j-TECS, volume = "9", number = "1", pages = "8:1--8:??", month = oct, year = "2009", CODEN = "????", ISSN = "1539-9087 (print), 1558-3465 (electronic)", ISSN-L = "1539-9087", bibdate = "Mon Mar 15 18:40:57 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/tecs.bib", acknowledgement = ack-nhfb, articleno = "8", fjournal = "ACM Transactions on Embedded Computing Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J840", } @Article{Madriles:2009:BST, author = "Carlos Madriles and Pedro L{\'o}pez and Josep M. 
Codina and Enric Gibert and Fernando Latorre and Alejandro Martinez and Ra{\'u}l Martinez and Antonio Gonzalez", title = "Boosting single-thread performance in multi-core systems through fine-grain multi-threading", journal = j-COMP-ARCH-NEWS, volume = "37", number = "3", pages = "474--483", month = jun, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1555754.1555813", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:55 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Industry has shifted towards multi-core designs as we have hit the memory and power walls. However, single thread performance remains of paramount importance since some applications have limited thread-level parallelism (TLP), and even a small part with limited TLP impose important constraints to the global performance, as explained by Amdahl's law.\par In this paper we propose a novel approach for leveraging multiple cores to improve single-thread performance in a multi-core design. The proposed technique features a set of novel hardware mechanisms that support the execution of threads generated at compile time. These threads result from a fine-grain speculative decomposition of the original application and they are executed under a modified multi-core system that includes: (1) mechanisms to support multiple versions; (2) mechanisms to detect violations among threads; (3) mechanisms to reconstruct the original sequential order; and (4) mechanisms to checkpoint the architectural state and recovery to handle misspeculations.\par The proposed scheme outperforms previous hardware-only schemes to implement the idea of combining cores for executing single-thread applications in a multi-core design by more than 10\% on average on Spec2006 for all configurations. 
Moreover, single-thread performance is improved by 41\% on average when the proposed scheme is used on a Tiny Core, and up to 2.6x for some selected applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", keywords = "automatic parallelization; core-fusion; multicore; single-thread performance; speculative multithreading; thread-level parallelism", } @Article{Marino:2009:LES, author = "Daniel Marino and Madanlal Musuvathi and Satish Narayanasamy", title = "{LiteRace}: effective sampling for lightweight data-race detection", journal = j-SIGPLAN, volume = "44", number = "6", pages = "134--143", month = jun, year = "2009", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1542476.1542491", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jun 16 14:41:16 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Data races are one of the most common and subtle causes of pernicious concurrency bugs. Static techniques for preventing data races are overly conservative and do not scale well to large programs. Past research has produced several dynamic data race detectors that can be applied to large programs. They are precise in the sense that they only report actual data races. However, dynamic data race detectors incur a high performance overhead, slowing down a program's execution by an order of magnitude.\par In this paper we present LiteRace, a very lightweight data race detector that samples and analyzes only selected portions of a program's execution. We show that it is possible to sample a multithreaded program at a low frequency, and yet, find infrequently occurring data races. We implemented LiteRace using Microsoft's Phoenix compiler. 
Our experiments with several Microsoft programs, Apache, and Firefox show that LiteRace is able to find more than 70\% of data races by sampling less than 2\% of memory accesses in a given program execution.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "concurrency bugs; dynamic data race detection; sampling", } @Article{Monchiero:2009:HSC, author = "Matteo Monchiero and Jung Ho Ahn and Ayose Falc{\'o}n and Daniel Ortega and Paolo Faraboschi", title = "How to simulate 1000 cores", journal = j-COMP-ARCH-NEWS, volume = "37", number = "2", pages = "10--19", month = may, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1577129.1577133", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:39 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "This paper proposes a novel methodology to efficiently simulate shared-memory multiprocessors composed of hundreds of cores. The basic idea is to use thread-level parallelism in the software system and translate it into core-level parallelism in the simulated world. To achieve this, we first augment an existing full-system simulator to identify and separate the instruction streams belonging to the different software threads. Then, the simulator dynamically maps each instruction flow to the corresponding core of the target multi-core architecture, taking into account the inherent thread synchronization of the running applications. Our simulator allows a user to execute any multithreaded application in a conventional full-system simulator and evaluate the performance of the application on a many-core hardware. We carried out extensive simulations on the SPLASH-2 benchmark suite and demonstrated the scalability up to 1024 cores with limited simulation speed degradation vs. the single-core case on a fixed workload. 
The results also show that the proposed technique captures the intrinsic behavior of the SPLASH-2 suite, even when we scale up the number of shared-memory cores beyond the thousand-core limit.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Mukherjee:2009:PAS, author = "Jayanta Mukherjee and Soumyendu Raha", title = "Power-aware Speed-up for Multithreaded Numerical Linear Algebraic Solvers on Chip Multicore Processors", journal = j-SCPE, volume = "10", number = "2", pages = "217--228", month = jun, year = "2009", CODEN = "????", ISSN = "1895-1767", bibdate = "Thu Sep 2 11:55:11 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.scpe.org/content/10/2.toc", URL = "http://www.scpe.org/vols/vol10/no2/SCPE_10_2_07.pdf; http://www.scpe.org/vols/vol10/no2/SCPE_10_2_07.zip", acknowledgement = ack-nhfb, } @Article{Musoll:2009:LSO, author = "Enric Musoll", title = "Leakage-saving opportunities in mesh-based massive multi-core architectures", journal = j-COMP-ARCH-NEWS, volume = "37", number = "5", pages = "1--7", month = dec, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1755235.1755237", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Apr 8 18:42:25 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "When processing multi-threaded workloads requiring significant inter-thread communication, opportunities to reduce power consumption arise due to the large latencies in obtaining data from the threads running on remote cores and the lack of architectural resources implemented in the simple cores to cover these latencies.\par In this work we propose to use the drowsy mode technique to save leakage power on the cores and leverage the mesh-based communication fabric to hide the wake-up latency of the core blocks. 
We have observed a potential for reducing the overall power of around 70\% in a generic homogeneous 256-core tile-based multi-core architecture.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Neamtiu:2009:STU, author = "Iulian Neamtiu and Michael Hicks", title = "Safe and timely updates to multi-threaded programs", journal = j-SIGPLAN, volume = "44", number = "6", pages = "13--24", month = jun, year = "2009", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1543135.1542479", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jun 16 14:41:16 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Many dynamic updating systems have been developed that enable a program to be patched while it runs, to fix bugs or add new features. This paper explores techniques for supporting dynamic updates to multi-threaded programs, focusing on the problem of applying an update in a timely fashion while still producing correct behavior. Past work has shown that this tension of {\em safety\/} versus timeliness can be balanced for single-threaded programs. For multi-threaded programs, the task is more difficult because myriad thread interactions complicate understanding the possible program states to which a patch could be applied. Our approach allows the programmer to specify a few program points (e.g., one per thread) at which a patch may be applied, which simplifies reasoning about safety. To improve timeliness, a combination of static analysis and run-time support automatically expands these few points to many more that produce behavior equivalent to the originals. 
Experiments with thirteen realistic updates to three multi-threaded servers show that we can safely perform a dynamic update within milliseconds when more straightforward alternatives would delay some updates indefinitely.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "dynamic software updating; multi-threading; update safety; update timeliness", } @Article{Nicolau:2009:TEP, author = "Alexandru Nicolau and Guangqiang Li and Arun Kejariwal", title = "Techniques for efficient placement of synchronization primitives", journal = j-SIGPLAN, volume = "44", number = "4", pages = "199--208", month = apr, year = "2009", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1504176.1504207", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 9 08:40:49 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Harnessing the hardware parallelism of the emerging multi-cores systems necessitates concurrent software. Unfortunately, most of the existing mainstream software is sequential in nature. Although one could auto-parallelize a given program, the efficacy of this is largely limited to floating-point codes. One of the ways to alleviate the above limitation is to parallelize programs, which cannot be auto-parallelized, via explicit synchronization. In this regard, efficient placement of the synchronization primitives --- say, post, wait --- plays a key role in achieving high degree of thread-level parallelism ({\em TLP\/}). In this paper, we propose novel compiler techniques for the above. Specifically, given a control flow graph ({\em CFG\/}), the proposed techniques place a post as early as possible and place a wait as late as possible in the CFG, subject to dependences. 
We demonstrate the efficacy of our techniques, on a real machine, using real codes, specifically, from the industry-standard SPEC CPU benchmarks, the Linux kernel and other widely used open source codes. Our results show that the proposed techniques yield significantly higher levels of TLP than the state-of-the-art.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "compilers; multithreading; parallelization; performance", } @Article{Olszewski:2009:KED, author = "Marek Olszewski and Jason Ansel and Saman Amarasinghe", title = "{Kendo}: efficient deterministic multithreading in software", journal = j-SIGPLAN, volume = "44", number = "3", pages = "97--108", month = mar, year = "2009", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1508244.1508256", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jun 16 14:39:26 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Although chip-multiprocessors have become the industry standard, developing parallel applications that target them remains a daunting task. Non-determinism, inherent in threaded applications, causes significant challenges for parallel programmers by hindering their ability to create parallel applications with repeatable results. As a consequence, parallel applications are significantly harder to debug, test, and maintain than sequential programs.\par This paper introduces Kendo: a new software-only system that provides deterministic multithreading of parallel applications. Kendo enforces a deterministic interleaving of lock acquisitions and specially declared non-protected reads through a novel dynamically load-balanced deterministic scheduling algorithm. 
The algorithm tracks the progress of each thread using performance counters to construct a deterministic logical time that is used to compute an interleaving of shared data accesses that is both deterministic and provides good load balancing. Kendo can run on today's commodity hardware while incurring only a modest performance cost. Experimental results on the SPLASH-2 applications yield a geometric mean overhead of only 16\% when running on 4 processors. This low overhead makes it possible to benefit from Kendo even after an application is deployed. Programmers can start using Kendo today to program parallel applications that are easier to develop, debug, and test.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "debugging; determinism; deterministic multithreading; multicore; parallel programming", } @Article{Pichel:2009:IDR, author = "J. C. Pichel and D. B. Heras and J. C. Cabaleiro and F. F. Rivera", title = "Increasing data reuse of sparse algebra codes on simultaneous multithreading architectures", journal = j-CCPE, volume = "21", number = "15", pages = "1838--1856", month = oct, year = "2009", CODEN = "CCPEBO", DOI = "https://doi.org/10.1002/cpe.1404", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Mon Dec 5 10:08:38 MST 2011", bibsource = "http://www.interscience.wiley.com/jpages/1532-0626; https://www.math.utah.edu/pub/tex/bib/ccpe.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Concurrency and Computation: Prac\-tice and Experience", journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626", onlinedate = "11 Feb 2009", } @Article{Piringer:2009:MTA, author = "Harald Piringer and Christian Tominski and Philipp Muigg and Wolfgang Berger", title = "A Multi-Threading Architecture to Support Interactive Visual Exploration", journal = j-IEEE-TRANS-VIS-COMPUT-GRAPH, volume = 
"15", number = "6", pages = "1113--1120", month = nov # "\slash " # dec, year = "2009", CODEN = "ITVGEA", DOI = "https://doi.org/10.1109/TVCG.2009.110", ISSN = "1077-2626 (print), 1941-0506 (electronic), 2160-9306", ISSN-L = "1077-2626", bibdate = "Thu May 13 17:38:49 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetransviscomputgraph.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Visualization and Computer Graphics", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2945", } @Article{Quintana-Orti:2009:PMA, author = "Gregorio Quintana-Ort{\'\i} and Enrique S. Quintana-Ort{\'\i} and Robert A. {Van De Geijn} and Field G. {Van Zee} and Ernie Chan", title = "Programming matrix algorithms-by-blocks for thread-level parallelism", journal = j-TOMS, volume = "36", number = "3", pages = "14:1--14:26", month = jul, year = "2009", CODEN = "ACMSCU", DOI = "https://doi.org/10.1145/1527286.1527288", ISSN = "0098-3500 (print), 1557-7295 (electronic)", ISSN-L = "0098-3500", bibdate = "Tue Jul 21 14:09:07 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "With the emergence of thread-level parallelism as the primary means for continued performance improvement, the programmability issue has reemerged as an obstacle to the use of architectural advances. We argue that evolving legacy libraries for dense and banded linear algebra is not a viable solution due to constraints imposed by early design decisions. We propose a philosophy of abstraction and separation of concerns that provides a promising solution in this problem domain. The first abstraction, FLASH, allows algorithms to express computation with matrices consisting of contiguous blocks, facilitating algorithms-by-blocks. Operand descriptions are registered for a particular operation a priori by the library implementor. 
A runtime system, SuperMatrix, uses this information to identify data dependencies between suboperations, allowing them to be scheduled to threads out-of-order and executed in parallel. But not all classical algorithms in linear algebra lend themselves to conversion to algorithms-by-blocks. We show how our recently proposed LU factorization with incremental pivoting and a closely related algorithm-by-blocks for the QR factorization, both originally designed for out-of-core computation, overcome this difficulty. Anecdotal evidence regarding the development of routines with a core functionality demonstrates how the methodology supports high productivity while experimental results suggest that high performance is abundantly achievable.", acknowledgement = ack-nhfb, articleno = "14", fjournal = "ACM Transactions on Mathematical Software (TOMS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J782", keywords = "high-performance; libraries; Linear algebra; multithreaded architectures", } @Article{Raghavan:2009:DLC, author = "P. Raghavan and A. Lambrechts and M. Jayapala and F. Catthoor and D. 
Verkest", title = "Distributed Loop Controller for Multithreading in Unithreaded {ILP} Architectures", journal = j-IEEE-TRANS-COMPUT, volume = "58", number = "3", pages = "311--321", month = mar, year = "2009", CODEN = "ITCOB4", DOI = "https://doi.org/10.1109/TC.2008.168", ISSN = "0018-9340 (print), 1557-9956 (electronic)", ISSN-L = "0018-9340", bibdate = "Mon Jul 4 11:37:40 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2000.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=4624249", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Computers", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12", } @Article{Ratanaworabhan:2009:DTA, author = "Paruj Ratanaworabhan and Martin Burtscher and Darko Kirovski and Benjamin Zorn and Rahul Nagpal and Karthik Pattabiraman", title = "Detecting and tolerating asymmetric races", journal = j-SIGPLAN, volume = "44", number = "4", pages = "173--184", month = apr, year = "2009", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1504176.1504202", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 9 08:40:49 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "This paper introduces ToleRace, a runtime system that allows programs to detect and even tolerate asymmetric data races. Asymmetric races are race conditions where one thread correctly acquires and releases a lock for a shared variable while another thread improperly accesses the same variable. ToleRace provides approximate isolation in the critical sections of lock-based parallel programs by creating a local copy of each shared variable when entering a critical section, operating on the local copies, and propagating the appropriate copies upon leaving the critical section. 
We start by characterizing all possible interleavings that can cause races and precisely describe the effect of ToleRace in each case. Then, we study the theoretical aspects of an oracle that knows exactly what type of interleaving has occurred. Finally, we present two software implementations of ToleRace and evaluate them on multithreaded applications from the SPLASH2 and PARSEC suites. Our implementation on top of a dynamic instrumentation tool, which works directly on executables and requires no source code modifications, incurs an overhead of a factor of two on average. Manually adding ToleRace to the source code of these applications results in an average overhead of 6.4 percent.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "dynamic instrumentation; race detection and toleration; runtime support", } @Article{Riccobene:2009:SCB, author = "Elvinia Riccobene and Patrizia Scandurra and Sara Bocchio and Alberto Rosti and Luigi Lavazza and Luigi Mantellini", title = "{SystemC\slash C-based} model-driven design for embedded systems", journal = j-TECS, volume = "8", number = "4", pages = "30:1--30:??", month = jul, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1550987.1550993", ISSN = "1539-9087 (print), 1558-3465 (electronic)", ISSN-L = "1539-9087", bibdate = "Thu Jul 23 12:32:49 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "This article summarizes our effort, since 2004 up to the present time, for improving the current industrial Systems-on-Chip and Embedded Systems design by joining the capabilities of the unified modeling language (UML) and SystemC/C programming languages to operate at system-level. 
The proposed approach exploits the OMG model-driven architecture --- a framework for Model-driven Engineering --- capabilities of reducing abstract, coarse-grained and platform-independent system models to fine-grained and platform-specific models. We first defined a design methodology and a development flow for the hardware, based on a SystemC UML profile and encompassing different levels of abstraction. We then included a multithread C UML profile for modelling software applications. Both SystemC/C profiles are consistent sets of modelling constructs designed to lift the programming features (both structural and behavioral) of the two coding languages to the UML modeling level. The new codesign flow is supported by an environment, which allows system modeling at higher abstraction levels (from a functional executable level to a register transfer level) and supports automatic code-generation/back-annotation from/to UML models.", acknowledgement = ack-nhfb, articleno = "30", fjournal = "ACM Transactions on Embedded Computing Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J840", keywords = "C; ES; MDE; SoC; SystemC; UML", } @Article{Roy:2009:LPF, author = "Indrajit Roy and Donald E. Porter and Michael D. Bond and Kathryn S. McKinley and Emmett Witchel", title = "{Laminar}: practical fine-grained decentralized information flow control", journal = j-SIGPLAN, volume = "44", number = "6", pages = "63--74", month = jun, year = "2009", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1543135.1542484", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jun 16 14:41:16 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Decentralized information flow control (DIFC) is a promising model for writing programs with powerful, end-to-end security guarantees. 
Current DIFC systems that run on commodity hardware can be broadly categorized into two types: language-level and operating system-level DIFC. Language level solutions provide no guarantees against security violations on system resources, like files and sockets. Operating system solutions can mediate accesses to system resources, but are inefficient at monitoring the flow of information through fine-grained program data structures.\par This paper describes Laminar, the first system to implement decentralized information flow control using a single set of abstractions for OS resources and heap-allocated objects. Programmers express security policies by labeling data with secrecy and integrity labels, and then access the labeled data in lexically scoped security regions. Laminar enforces the security policies specified by the labels at runtime. Laminar is implemented using a modified Java virtual machine and a new Linux security module. This paper shows that security regions ease incremental deployment and limit dynamic security checks, allowing us to retrofit DIFC policies on four application case studies. Replacing the applications' ad-hoc security policies changes less than 10\% of the code, and incurs performance overheads from 1\% to 56\%. Whereas prior DIFC systems only support limited types of multithreaded programs, Laminar supports a more general class of multithreaded DIFC programs that can access heterogeneously labeled data.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "information flow control; java virtual machine; operating systems; security region", } @Article{Sidiroglou:2009:AAS, author = "Stelios Sidiroglou and Oren Laadan and Carlos Perez and Nicolas Viennot and Jason Nieh and Angelos D. 
Keromytis", title = "{ASSURE}: automatic software self-healing using rescue points", journal = j-SIGPLAN, volume = "44", number = "3", pages = "37--48", month = mar, year = "2009", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1508284.1508250", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jun 16 14:39:26 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Software failures in server applications are a significant problem for preserving system availability. We present ASSURE, a system that introduces rescue points that recover software from unknown faults while maintaining both system integrity and availability, by mimicking system behavior under known error conditions. Rescue points are locations in existing application code for handling a given set of programmer-anticipated failures, which are automatically repurposed and tested for safely enabling fault recovery from a larger class of (unanticipated) faults. When a fault occurs at an arbitrary location in the program, ASSURE restores execution to an appropriate rescue point and induces the program to recover execution by virtualizing the program's existing error-handling facilities. Rescue points are identified using fuzzing, implemented using a fast coordinated checkpoint-restart mechanism that handles multi-process and multi-threaded applications, and, after testing, are injected into production code using binary patching. We have implemented an ASSURE Linux prototype that operates without application source code and without base operating system kernel changes. 
Our experimental results on a set of real-world server applications and bugs show that ASSURE enabled recovery for all of the bugs tested with fast recovery times, has modest performance overhead, and provides automatic self-healing orders of magnitude faster than current human-driven patch deployment methods.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "binary patching; checkpoint restart; error recovery; reliable software; software self-healing", } @Article{Son:2009:CDD, author = "Seung Woo Son and Mahmut Kandemir and Mustafa Karakoy and Dhruva Chakrabarti", title = "A compiler-directed data prefetching scheme for chip multiprocessors", journal = j-SIGPLAN, volume = "44", number = "4", pages = "209--218", month = apr, year = "2009", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1504176.1504208", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 9 08:40:49 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Data prefetching has been widely used in the past as a technique for hiding memory access latencies. However, data prefetching in multi-threaded applications running on chip multiprocessors (CMPs) can be problematic when multiple cores compete for a shared on-chip cache (L2 or L3). In this paper, we (i) quantify the impact of conventional data prefetching on shared caches in CMPs. The experimental data collected using multi-threaded applications indicates that, while data prefetching improves performance in small number of cores, its benefits reduce significantly as the number of cores is increased, that is, it is not scalable; (ii) identify harmful prefetches as one of the main contributors for degraded performance with a large number of cores; and (iii) propose and evaluate a compiler-directed data prefetching scheme for shared on-chip cache based CMPs. 
The proposed scheme first identifies program phases using static compiler analysis, and then divides the threads into groups within each phase and assigns a customized prefetcher thread (helper thread) to each group of threads. This helps to reduce the total number of prefetches issued, prefetch overheads, and negative interactions on the shared cache space due to data prefetches, and more importantly, makes compiler-directed prefetching a scalable optimization for CMPs. Our experiments with the applications from the SPEC OMP benchmark suite indicate that the proposed scheme improves overall parallel execution latency by 18.3\% over the no-prefetch case and 6.4\% over the conventional data prefetching scheme (where each core prefetches its data independently), on average, when 12 cores are used. The corresponding average performance improvements with 24 cores are 16.4\% (over the no-prefetch case) and 11.7\% (over the conventional prefetching case). We also demonstrate that the proposed scheme is robust under a wide range of values of our major simulation parameters, and the improvements it achieves come very close to those that can be achieved using an optimal scheme.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "chip multiprocessors; compiler; helper thread; prefetching", } @Article{Suleman:2009:ACS, author = "M. Aater Suleman and Onur Mutlu and Moinuddin K. Qureshi and Yale N. 
Patt", title = "Accelerating critical section execution with asymmetric multi-core architectures", journal = j-SIGPLAN, volume = "44", number = "3", pages = "253--264", month = mar, year = "2009", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1508244.1508274", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jun 16 14:39:26 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "To improve the performance of a single application on Chip Multiprocessors (CMPs), the application must be split into threads which execute concurrently on multiple cores. In multi-threaded applications, critical sections are used to ensure that only one thread accesses shared data at any given time. Critical sections can serialize the execution of threads, which significantly reduces performance and scalability.\par This paper proposes Accelerated Critical Sections (ACS), a technique that leverages the high-performance core(s) of an Asymmetric Chip Multiprocessor (ACMP) to accelerate the execution of critical sections. In ACS, selected critical sections are executed by a high-performance core, which can execute the critical section faster than the other, smaller cores. As a result, ACS reduces serialization: it lowers the likelihood of threads waiting for a critical section to finish. Our evaluation on a set of 12 critical-section-intensive workloads shows that ACS reduces the average execution time by 34\% compared to an equal-area 32-core symmetric CMP and by 23\% compared to an equal-area ACMP. 
Moreover, for 7 out of the 12 workloads, ACS improves scalability by increasing the number of threads at which performance saturates.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "cmp; critical sections; heterogeneous cores; locks; multi-core; parallel programming", } @Book{Swinnen:2009:APA, author = "G{\'e}rard Swinnen", title = "Apprendre {\`a} programmer avec Python: objet, multithreading, {\'e}v{\'e}nements, bases de donn{\'e}es, programmation web, programmation r{\'e}seau, Unicode", publisher = pub-EYROLLES, address = pub-EYROLLES:adr, pages = "xviii + 341", year = "2009", LCCN = "????", bibdate = "Thu Apr 16 12:00:29 MDT 2009", bibsource = "carmin.sudoc.abes.fr:210/ABES-Z39-PUBLIC; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, language = "French", } @Article{Tallent:2009:EPM, author = "Nathan R. Tallent and John M. Mellor-Crummey", title = "Effective performance measurement and analysis of multithreaded applications", journal = j-SIGPLAN, volume = "44", number = "4", pages = "229--240", month = apr, year = "2009", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1504176.1504210", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 9 08:40:49 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Understanding why the performance of a multithreaded program does not improve linearly with the number of cores in a shared-memory node populated with one or more multicore processors is a problem of growing practical importance. This paper makes three contributions to performance analysis of multithreaded programs. First, we describe how to measure and attribute {\em parallel idleness}, namely, where threads are stalled and unable to work. 
This technique applies broadly to programming models ranging from explicit threading ({\em e.g.}, Pthreads) to higher-level models such as Cilk and OpenMP. Second, we describe how to measure and attribute {\em parallel overhead\/} -- when a thread is performing miscellaneous work other than executing the user's computation. By employing a combination of compiler support and post-mortem analysis, we incur no measurement cost beyond normal profiling to glean this information. Using {\em idleness\/} and {\em overhead\/} metrics enables one to pinpoint areas of an application where concurrency should be increased (to reduce idleness), decreased (to reduce overhead), or where the present parallelization is hopeless (where idleness and overhead are both high). Third, we describe how to measure and attribute arbitrary performance metrics for high-level multithreaded programming models, such as Cilk. This requires bridging the gap between the expression of logical concurrency in programs and its realization at run-time as it is adaptively partitioned and scheduled onto a pool of threads. We have prototyped these ideas in the context of Rice University's HPCToolkit performance tools. 
We describe our approach, implementation, and experiences applying this approach to measure and attribute work, idleness, and overhead in executions of Cilk programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "call path profiling; hpctoolkit; multithreaded programming models; performance analysis", } @Article{Thakur:2009:TSE, author = "Rajeev Thakur and William Gropp", title = "Test suite for evaluating performance of multithreaded {MPI} communication", journal = j-PARALLEL-COMPUTING, volume = "35", number = "12", pages = "608--617", month = dec, year = "2009", CODEN = "PACOEJ", ISSN = "0167-8191 (print), 1872-7336 (electronic)", ISSN-L = "0167-8191", bibdate = "Thu Sep 2 17:51:11 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sciencedirect.com/science/journal/01678191", acknowledgement = ack-nhfb, fjournal = "Parallel Computing", journal-URL = "http://www.sciencedirect.com/science/journal/01678191", } @Article{Vander-Swalmen:2009:CAM, author = "Pascal Vander-Swalmen and Gilles Dequen and Micha{\"e}l Krajecki", title = "A Collaborative Approach for Multi-Threaded {SAT} Solving", journal = j-INT-J-PARALLEL-PROG, volume = "37", number = "3", pages = "324--342", month = jun, year = "2009", CODEN = "IJPPE5", ISSN = "0885-7458 (print), 1573-7640 (electronic)", ISSN-L = "0885-7458", bibdate = "Wed Sep 1 16:06:47 MDT 2010", bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=37&issue=3; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=37&issue=3&spage=324", acknowledgement = ack-nhfb, fjournal = "International Journal of Parallel Programming", journal-URL = "http://link.springer.com/journal/10766", } @Article{Vera:2009:SRL, author = "Xavier Vera and Jaume Abella and Javier Carretero and Antonio 
Gonz{\'a}lez", title = "Selective replication: a lightweight technique for soft errors", journal = j-TOCS, volume = "27", number = "4", pages = "8:1--8:30", month = dec, year = "2009", CODEN = "ACSYEC", DOI = "https://doi.org/10.1145/1658357.1658359", ISSN = "0734-2071 (print), 1557-7333 (electronic)", ISSN-L = "0734-2071", bibdate = "Mon Mar 15 09:06:46 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tocs/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Soft errors are an important challenge in contemporary microprocessors. Modern processors have caches and large memory arrays protected by parity or error detection and correction codes. However, today's failure rate is dominated by flip flops, latches, and the increasing sensitivity of combinational logic to particle strikes. Moreover, as Chip Multi-Processors (CMPs) become ubiquitous, meeting the FIT budget for new designs is becoming a major challenge.\par Solutions based on replicating threads have been explored deeply; however, their high cost in performance and energy make them unsuitable for current designs. Moreover, our studies based on a typical configuration for a modern processor show that focusing on the top 5 most vulnerable structures can provide up to 70\% reduction in FIT rate. Therefore, full replication may overprotect the chip by reducing the FIT much below budget.\par We propose {\em Selective Replication}, a lightweight-reconfigurable mechanism that achieves a high FIT reduction by protecting the most vulnerable instructions with minimal performance and energy impact. Low performance degradation is achieved by not requiring additional issue slots and reissuing instructions only during the time window between when they are retirable and they actually retire. Coverage can be reconfigured online by replicating only a subset of the instructions (the most vulnerable ones). 
Instructions' vulnerability is estimated based on the area they occupy and the time they spend in the issue queue. By changing the vulnerability threshold, we can adjust the trade-off between coverage and performance loss.\par Results for an out-of-order processor configured similarly to Intel{\reg} Core\TM{} Micro-Architecture show that our scheme can achieve over 65\% FIT reduction with less than 4\% performance degradation with small area and complexity overhead.", acknowledgement = ack-nhfb, articleno = "8", fjournal = "ACM Transactions on Computer Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J774", keywords = "AVF prediction; FIT reduction; redundant multithreading; Soft errors", } @Article{Wang:2009:TDA, author = "Yin Wang and St{\'e}phane Lafortune and Terence Kelly and Manjunath Kudlur and Scott Mahlke", title = "The theory of deadlock avoidance via discrete control", journal = j-SIGPLAN, volume = "44", number = "1", pages = "252--263", month = jan, year = "2009", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1480881.1480913", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 9 08:40:38 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Deadlock in multithreaded programs is an increasingly important problem as ubiquitous multicore architectures force parallelization upon an ever wider range of software. This paper presents a theoretical foundation for dynamic deadlock avoidance in concurrent programs that employ conventional mutual exclusion and synchronization primitives (e.g., multithreaded C/Pthreads programs). Beginning with control flow graphs extracted from program source code, we construct a formal model of the program and then apply Discrete Control Theory to automatically synthesize deadlock-avoidance control logic that is implemented by program instrumentation. 
At run time, the control logic avoids deadlocks by postponing lock acquisitions. Discrete Control Theory guarantees that the program instrumented with our synthesized control logic cannot deadlock. Our method furthermore guarantees that the control logic is maximally permissive: it postpones lock acquisitions only when necessary to prevent deadlocks, and therefore permits maximal runtime concurrency. Our prototype for C/Pthreads scales to real software including Apache, OpenLDAP, and two kinds of benchmarks, automatically avoiding both injected and naturally occurring deadlocks while imposing modest runtime overheads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "concurrent programming; discrete control theory; dynamic deadlock avoidance; multicore processors; multithreaded programming; parallel programming", } @Article{Youseff:2009:PES, author = "Lamia Youseff and Keith Seymour and Haihang You and Dmitrii Zagorodnov and Jack Dongarra and Rich Wolski", title = "Paravirtualization effect on single- and multi-threaded memory-intensive linear algebra software", journal = "The Journal of Networks, Software Tools, and Cluster Computing", volume = "12", number = "2", pages = "101--122", month = "????", year = "2009", DOI = "https://doi.org/10.1007/s10586-009-0080-4", ISSN = "1386-7857", bibdate = "Tue Jun 4 08:20:03 MDT 2013", bibsource = "https://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Yu:2009:CIC, author = "Jie Yu and Satish Narayanasamy", title = "A case for an interleaving constrained shared-memory multi-processor", journal = j-COMP-ARCH-NEWS, volume = "37", number = "3", pages = "325--336", month = jun, year = "2009", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1555815.1555796", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", 
bibdate = "Tue Aug 11 18:12:55 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Shared-memory multi-threaded programming is inherently more difficult than single-threaded programming. The main source of complexity is that, the threads of an application can interleave in so many different ways. To ensure correctness, a programmer has to test all possible thread interleavings, which, however, is impractical.\par Many rare thread interleavings remain untested in production systems, and they are the root cause for a majority of concurrency bugs. We propose a shared-memory multi-processor design that avoids untested interleavings to improve the correctness of a multi-threaded program. Since untested interleavings tend to occur infrequently at runtime, the performance cost of avoiding them is not high.\par We propose to encode the set of tested correct interleavings in a program's binary executable using {\em Predecessor Set (PSet)\/} constraints. These constraints are efficiently enforced at runtime using processor support, which ensures that the runtime follows a tested interleaving. We analyze several bugs in open source applications such as MySQL, Apache, Mozilla, etc., and show that, by enforcing PSet constraints, we can avoid not only data races and atomicity violations, but also other forms of concurrency bugs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", keywords = "concurrency bugs; multiprocessors; parallel programming; software reliability", } @Article{Ziarek:2009:SWB, author = "Lukasz Ziarek and Suresh Jagannathan and Matthew Fluet and Umut A. 
Acar", title = "Speculative {$N$}-way barriers (abstract only)", journal = j-SIGPLAN, volume = "44", number = "5", pages = "8--8", month = may, year = "2009", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1629635.1629637", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jun 21 18:01:41 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Speculative execution is an important technique that has historically been used to extract concurrency from sequential programs. While techniques to support speculation work well when computations perform relatively simple actions (e.g., reads and writes to known locations), understanding speculation for multi-threaded programs in which threads may communicate and synchronize through multiple shared references is significantly more challenging, and is the focus of this paper.\par We use as our reference point a simple higher-order concurrent language extended with an n-way barrier and a fork/join execution model. Our technique permits the expression guarded by the barrier to speculatively proceed before the barrier has been satisfied (i.e., before all threads that synchronize on that barrier have done so) and to have participating threads that would normally block on the barrier to speculatively proceed as well. Our solution formulates safety properties under which speculation is correct in a fork/join model, and per-synchronization basis.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Agarwal:2010:DDP, author = "R. Agarwal and S. Bensalem and E. Farchi and K. Havelund and Y. Nir-Buchbinder and S. Stoller and S. Ur and L. 
Wang", title = "Detection of deadlock potentials in multithreaded programs", journal = j-IBM-JRD, volume = "54", number = "5", pages = "3:1--3:15", month = "????", year = "2010", CODEN = "IBMJAE", DOI = "https://doi.org/10.1147/JRD.2010.2060276", ISSN = "0018-8646 (print), 2151-8556 (electronic)", ISSN-L = "0018-8646", bibdate = "Sun Feb 20 14:29:19 MST 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.research.ibm.com/journal/", acknowledgement = ack-nhfb, fjournal = "IBM Journal of Research and Development", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520", } @Article{Agrawal:2010:HLF, author = "Kunal Agrawal and Charles E. Leiserson and Jim Sukha", title = "Helper locks for fork-join parallel programming", journal = j-SIGPLAN, volume = "45", number = "5", pages = "245--256", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1693453.1693487", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Helper locks allow programs with large parallel critical sections, called parallel regions, to execute more efficiently by enlisting processors that might otherwise be waiting on the helper lock to aid in the execution of the parallel region. Suppose that a processor {\em p\/} is executing a parallel region {\em A\/} after having acquired the lock {\em L\/} protecting {\em A}. If another processor {\em p\/} $ \prime $ tries to acquire {\em L}, then instead of blocking and waiting for {\em p\/} to complete {\em A}, processor {\em p\/} $ \prime $ joins {\em p\/} to help it complete {\em A}. Additional processors not blocked on {\em L\/} may also help to execute {\em A}.\par The HELPER runtime system can execute fork-join computations augmented with helper locks and parallel regions. 
HELPER supports the unbounded nesting of parallel regions. We provide theoretical completion-time and space-usage bounds for a design of HELPER based on work stealing. Specifically, let {\em V\/} be the number of parallel regions in a computation, let {\em T\/}$_1$ be its work, and let {\em T\/} $ \infty $ be its 'aggregate span' --- the sum of the spans (critical-path lengths) of all its parallel regions. We prove that HELPER completes the computation in expected time {\em O\/} ({\em T\/}$_1$ / {\em P\/} + {\em T\/} $ \infty $ + {\em PV\/}) on {\em P\/} processors. This bound indicates that programs with a small number of highly parallel critical sections can attain linear speedup. For the space bound, we prove that HELPER completes a program using only $O(P S_1)$ stack space, where $S_1$ is the sum, over all regions, of the stack space used by each region in a serial execution. Finally, we describe a prototype of HELPER implemented by modifying the Cilk multithreaded runtime system.
We used this prototype to implement a concurrent hash table with a resize operation protected by a helper lock.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "Cilk; dynamic multithreading; helper lock; nested parallelism; parallel region; scheduling; work stealing", } @Article{Balaji:2010:FGM, author = "Pavan Balaji and Darius Buntinas and David Goodell and William Gropp and Rajeev Thakur", title = "Fine-Grained Multithreading Support for Hybrid Threaded {MPI} Programming", journal = j-IJHPCA, volume = "24", number = "1", pages = "49--57", month = feb, year = "2010", CODEN = "IHPCFL", DOI = "https://doi.org/10.1177/1094342009360206", ISSN = "1094-3420 (print), 1741-2846 (electronic)", ISSN-L = "1094-3420", bibdate = "Tue Aug 31 09:59:45 MDT 2010", bibsource = "http://hpc.sagepub.com/content/24/1.toc; http://hpc.sagepub.com/content/by/year; https://www.math.utah.edu/pub/tex/bib/ijsa.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pvm.bib", URL = "http://hpc.sagepub.com/content/24/1/49.full.pdf+html", acknowledgement = ack-nhfb, fjournal = "International Journal of High Performance Computing Applications", journal-URL = "http://hpc.sagepub.com/content/by/year", } @Article{Barthe:2010:SMP, author = "Gilles Barthe and Tamara Rezk and Alejandro Russo and Andrei Sabelfeld", title = "Security of multithreaded programs by compilation", journal = j-TISSEC, volume = "13", number = "3", pages = "21:1--21:??", month = jul, year = "2010", CODEN = "ATISBQ", DOI = "https://doi.org/10.1145/1805974.1895977", ISSN = "1094-9224 (print), 1557-7406 (electronic)", ISSN-L = "1094-9224", bibdate = "Wed Jul 28 14:57:15 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "End-to-End security of mobile code requires that the code neither intentionally nor accidentally propagates sensitive information to an 
adversary. Although mobile code is commonly multithreaded low-level code, there lack enforcement mechanisms that ensure information security for such programs. The modularity is three-fold: we give modular extensions of sequential semantics, sequential security typing, and sequential security-type preserving compilation that allow us enforcing security for multithreaded programs. Thanks to the modularity, there are no more restrictions on multithreaded source programs than on sequential ones, and yet we guarantee that their compilations are provably secure for a wide class of schedulers.", acknowledgement = ack-nhfb, articleno = "21", fjournal = "ACM Transactions on Information and System Security", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J789", keywords = "compilers; Noninterference; schedulers; type systems", } @Article{Bergan:2010:CCRa, author = "Tom Bergan and Owen Anderson and Joseph Devietti and Luis Ceze and Dan Grossman", title = "{CoreDet}: a compiler and runtime system for deterministic multithreaded execution", journal = j-COMP-ARCH-NEWS, volume = "38", number = "1", pages = "53--64", month = mar, year = "2010", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Mar 17 14:42:04 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Bergan:2010:CCRb, author = "Tom Bergan and Owen Anderson and Joseph Devietti and Luis Ceze and Dan Grossman", title = "{CoreDet}: a compiler and runtime system for deterministic multithreaded execution", journal = j-SIGPLAN, volume = "45", number = "3", pages = "53--64", month = mar, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1736020.1736029", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 
13:46:56 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The behavior of a multithreaded program does not depend only on its inputs. Scheduling, memory reordering, timing, and low-level hardware effects all introduce nondeterminism in the execution of multithreaded programs. This severely complicates many tasks, including debugging, testing, and automatic replication. In this work, we avoid these complications by eliminating their root cause: we develop a compiler and runtime system that runs arbitrary multithreaded C/C++ POSIX Threads programs deterministically.\par A trivial nonperformant approach to providing determinism is simply deterministically serializing execution. Instead, we present a compiler and runtime infrastructure that ensures determinism but resorts to serialization rarely, for handling interthread communication and synchronization. We develop two basic approaches, both of which are largely dynamic with performance improved by some static compiler optimizations. First, an ownership-based approach detects interthread communication via an evolving table that tracks ownership of memory regions by threads. Second, a buffering approach uses versioned memory and employs a deterministic commit protocol to make changes visible to other threads. While buffering has larger single-threaded overhead than ownership, it tends to scale better (serializing less often). A hybrid system sometimes performs and scales better than either approach individually.\par Our implementation is based on the LLVM compiler infrastructure. It needs neither programmer annotations nor special hardware. 
Our empirical evaluation uses the PARSEC and SPLASH2 benchmarks and shows that our approach scales comparably to nondeterministic execution.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "compilers; determinism; multicore; multithreading", } @Article{Bokhari:2010:EPM, author = "Shahid Bokhari and Joel Saltz", title = "Exploring the performance of massively multithreaded architectures", journal = j-CCPE, volume = "22", number = "5", pages = "588--616", day = "10", month = apr, year = "2010", CODEN = "CCPEBO", DOI = "https://doi.org/10.1002/cpe.1484", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Mon Dec 5 10:08:42 MST 2011", bibsource = "http://www.interscience.wiley.com/jpages/1532-0626; https://www.math.utah.edu/pub/tex/bib/ccpe.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Concurrency and Computation: Prac\-tice and Experience", journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626", onlinedate = "1 Sep 2009", } @Article{Bronson:2010:PCB, author = "Nathan G. Bronson and Jared Casper and Hassan Chafi and Kunle Olukotun", title = "A practical concurrent binary search tree", journal = j-SIGPLAN, volume = "45", number = "5", pages = "257--268", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1693453.1693488", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "We propose a concurrent relaxed balance AVL tree algorithm that is fast, scales well, and tolerates contention. It is based on optimistic techniques adapted from software transactional memory, but takes advantage of specific knowledge of the algorithm to reduce overheads and avoid unnecessary retries. 
We extend our algorithm with a fast linearizable clone operation, which can be used for consistent iteration of the tree. Experimental evidence shows that our algorithm outperforms a highly tuned concurrent skip list for many access patterns, with an average of 39\% higher single-threaded throughput and 32\% higher multi-threaded throughput over a range of contention levels and operation mixes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "optimistic concurrency; snapshot isolation", } @Article{Buntinas:2010:MDC, author = "Darius Buntinas and Alexis J. Malozemoff and Jean Utke", title = "Multithreaded derivative computation with generated libraries", journal = j-J-COMPUT-SCI, volume = "1", number = "2", pages = "89--97", month = jun, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1016/j.jocs.2010.03.009", ISSN = "1877-7503 (print), 1877-7511 (electronic)", ISSN-L = "1877-7503", bibdate = "Tue Sep 19 13:53:02 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jcomputsci.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "https://www.sciencedirect.com/science/article/pii/S1877750310000116", acknowledgement = ack-nhfb, ajournal = "J. Comput. 
Sci.", fjournal = "Journal of Computational Science", journal-URL = "https://www.sciencedirect.com/journal/journal-of-computational-science", } @Article{Burnim:2010:ACD, author = "Jacob Burnim and Koushik Sen", title = "Asserting and checking determinism for multithreaded programs", journal = j-CACM, volume = "53", number = "6", pages = "97--105", month = jun, year = "2010", CODEN = "CACMA2", DOI = "https://doi.org/10.1145/1743546.1743572", ISSN = "0001-0782 (print), 1557-7317 (electronic)", ISSN-L = "0001-0782", bibdate = "Mon Jun 21 12:34:55 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/cacm/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Communications of the ACM", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J79", } @Article{Chen:2010:CCM, author = "Changbo Chen and Marc Moreno Maza and Yuzhen Xie", title = "Cache complexity and multicore implementation for univariate real root isolation", journal = j-ACM-COMM-COMP-ALGEBRA, volume = "44", number = "3", pages = "97--98", month = sep, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1940475.1940483", ISSN = "1932-2232 (print), 1932-2240 (electronic)", ISSN-L = "1932-2232", bibdate = "Thu Mar 31 10:24:16 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Isolating the real roots of a univariate polynomial is a driving subject in computer algebra. This problem has been studied under various angles from algebraic algorithms [1, 2, 7] to implementation techniques [3, 5]. Today, multicores are the most popular parallel hardware architectures. Beside, understanding the implications of hierarchical memory on performance software engineering has become essential. These observations motivate our study. We analyze the cache complexity of the core routine of many real root isolation algorithms namely, the Taylor shift.
Then, we present efficient multithreaded implementation on multicores.", acknowledgement = ack-nhfb, fjournal = "ACM Communications in Computer Algebra", issue = "173", } @Article{Chetlur:2010:SWM, author = "M. Chetlur and U. Devi and P. Dutta and P. Gupta and L. Chen and Z. Zhu and S. Kalyanaraman and Y. Lin", title = "A software {WiMAX} medium access control layer using massively multithreaded processors", journal = j-IBM-JRD, volume = "54", number = "1", pages = "??--??", month = "????", year = "2010", CODEN = "IBMJAE", ISSN = "0018-8646 (print), 2151-8556 (electronic)", ISSN-L = "0018-8646", bibdate = "Sat May 1 17:44:14 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.research.ibm.com/journal/", URL = "http://www.research.ibm.com/journal/abstracts/rd/541/chetlur-dutta.html", acknowledgement = ack-nhfb, articleno = "9", fjournal = "IBM Journal of Research and Development", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520", } @Article{Choi:2010:MDA, author = "Jee W. Choi and Amik Singh and Richard W. Vuduc", title = "Model-driven autotuning of sparse matrix-vector multiply on {GPUs}", journal = j-SIGPLAN, volume = "45", number = "5", pages = "115--126", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1693453.1693471", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "We present a performance model-driven framework for automated performance tuning (autotuning) of sparse matrix-vector multiply (SpMV) on systems accelerated by graphics processing units (GPU). Our study consists of two parts.\par First, we describe several carefully hand-tuned SpMV implementations for GPUs, identifying key GPU-specific performance limitations, enhancements, and tuning opportunities. 
These implementations, which include variants on classical blocked compressed sparse row (BCSR) and blocked ELLPACK (BELLPACK) storage formats, match or exceed state-of-the-art implementations. For instance, our best BELLPACK implementation achieves up to 29.0 Gflop/s in single-precision and 15.7 Gflop/s in double-precision on the NVIDIA T10P multiprocessor (C1060), enhancing prior state-of-the-art unblocked implementations (Bell and Garland, 2009) by up to $1.8\times$ and $1.5\times$ for single- and double-precision respectively.\par However, achieving this level of performance requires input matrix-dependent parameter tuning. Thus, in the second part of this study, we develop a performance model that can guide tuning. Like prior autotuning models for CPUs (e.g., Im, Yelick, and Vuduc, 2004), this model requires offline measurements and run-time estimation, but more directly models the structure of multithreaded vector processors like GPUs. We show that our model can identify the implementations that achieve within 15\% of those found through exhaustive search.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "GPU; performance modeling; sparse matrix-vector multiplication", } @Article{Coons:2010:GEU, author = "Katherine E. Coons and Sebastian Burckhardt and Madanlal Musuvathi", title = "{GAMBIT}: effective unit testing for concurrency libraries", journal = j-SIGPLAN, volume = "45", number = "5", pages = "15--24", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837853.1693458", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "As concurrent programming becomes prevalent, software providers are investing in concurrency libraries to improve programmer productivity.
Concurrency libraries improve productivity by hiding error-prone, low-level synchronization from programmers and providing higher-level concurrent abstractions. Testing such libraries is difficult, however, because concurrency failures often manifest only under particular scheduling circumstances. Current best testing practices are often inadequate: heuristic-guided fuzzing is not systematic, systematic schedule enumeration does not find bugs quickly, and stress testing is neither systematic nor fast.\par To address these shortcomings, we propose a prioritized search technique called GAMBIT that combines the speed benefits of heuristic-guided fuzzing with the soundness, progress, and reproducibility guarantees of stateless model checking. GAMBIT combines known techniques such as partial-order reduction and preemption-bounding with a generalized best-first search framework that prioritizes schedules likely to expose bugs. We evaluate GAMBIT's effectiveness on newly released concurrency libraries for Microsoft's .NET framework.
Our experiments show that GAMBIT finds bugs more quickly than prior stateless model checking techniques without compromising coverage guarantees or reproducibility.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "concurrency; model checking; multithreading; partial-order reduction; preemption bound; software testing", } @Article{Dam:2010:PCI, author = "Mads Dam and Bart Jacobs and Andreas Lundblad and Frank Piessens", title = "Provably correct inline monitoring for multithreaded {Java}-like programs", journal = j-J-COMP-SECUR, volume = "18", number = "1", pages = "37--59", month = "????", year = "2010", CODEN = "JCSIET", DOI = "https://doi.org/10.3233/JCS-2010-0365", ISSN = "0926-227X (print), 1875-8924 (electronic)", ISSN-L = "0926-227X", bibdate = "Tue May 24 06:24:34 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/jcompsecur.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Journal of Computer Security", journal-URL = "http://content.iospress.com/journals/journal-of-computer-security", } @Article{Ding:2010:PCM, author = "Jason Jianxun Ding and Abdul Waheed and Jingnan Yao and Laxmi N. 
Bhuyan", title = "Performance characterization of multi-thread and multi-core processors based {XML} application oriented networking systems", journal = j-J-PAR-DIST-COMP, volume = "70", number = "5", pages = "584--597", month = may, year = "2010", CODEN = "JPDCER", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Wed Sep 1 16:27:28 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sciencedirect.com/science/journal/07437315", acknowledgement = ack-nhfb, fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", } @Article{Dohi:2010:IPE, author = "Keisuke Dohi and Yuichiro Shibata and Tsuyoshi Hamada and Tomonari Masada and Kiyoshi Oguri and Duncan A. Buell", title = "Implementation of a programming environment with a multithread model for reconfigurable systems", journal = j-COMP-ARCH-NEWS, volume = "38", number = "4", pages = "40--45", month = sep, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1926367.1926375", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 20 14:27:03 MST 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Reconfigurable systems are known to be able to achieve higher performance than traditional microprocessor architecture for many application fields. However, in order to extract a full potential of the reconfigurable systems, programmers often have to design and describe the best suited code for their target architecture with specialized knowledge. The aim of this paper is to assist the users of reconfigurable systems by implementing a translator with a multithread model. 
The experimental results show our translator automatically generates efficient performance-aware code segments including DMA transfer and shift registers for memory access optimization.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Eggers:2010:AL, author = "Susan Eggers", title = "{2010 Athena} lecture", journal = j-SIGPLAN, volume = "45", number = "6", pages = "98--98", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1809028.1806608", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Susan Eggers, a Professor of Computer Science and Engineering at the University of Washington, joined her department in 1989. She received a B.A. in 1965 from Connecticut College and a Ph.D. in 1989 from the University of California, Berkeley. Her research interests are in computer architecture and back-end compiler optimization, with an emphasis on experimental performance analysis. With her colleague Hank Levy and their students, she developed the first commercially viable multithreaded architecture, Simultaneous Multithreading, adopted by Intel (as Hyperthreading), IBM, Sun and others. Her current research is in the areas of distributed dataflow machines, FPGAs and chip multiprocessors. In 1989 Professor Eggers was awarded an IBM Faculty Development Award, in 1990 an NSF Presidential Young Investigator Award, in 1994 the Microsoft Professorship in Computer Science and Engineering, and in 2009 the ACM-W Athena Lecturer. 
She is a Fellow of the ACM and IEEE, a Fellow of the AAAS, and a member of the National Academy of Engineering.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "invited talk", } @Article{Eyerman:2010:PJS, author = "Stijn Eyerman and Lieven Eeckhout", title = "Probabilistic job symbiosis modeling for {SMT} processor scheduling", journal = j-SIGPLAN, volume = "45", number = "3", pages = "91--102", month = mar, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1736020.1736033", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Symbiotic job scheduling boosts simultaneous multithreading (SMT) processor performance by co-scheduling jobs that have `compatible' demands on the processor's shared resources. Existing approaches however require a sampling phase, evaluate a limited number of possible co-schedules, use heuristics to gauge symbiosis, are rigid in their optimization target, and do not preserve system-level priorities/shares.\par This paper proposes probabilistic job symbiosis modeling, which predicts whether jobs will create positive or negative symbiosis when co-scheduled without requiring the co-schedule to be evaluated. The model, which uses per-thread cycle stacks computed through a previously proposed cycle accounting architecture, is simple enough to be used in system software. 
Probabilistic job symbiosis modeling provides six key innovations over prior work in symbiotic job scheduling: (i) it does not require a sampling phase, (ii) it readjusts the job co-schedule continuously, (iii) it evaluates a large number of possible co-schedules at very low overhead, (iv) it is not driven by heuristics, (v) it can optimize a performance target of interest (e.g., system throughput or job turnaround time), and (vi) it preserves system-level priorities/shares. These innovations make symbiotic job scheduling both practical and effective.\par Our experimental evaluation, which assumes a realistic scenario in which jobs come and go, reports an average 16\% (and up to 35\%) reduction in job turnaround time compared to the previously proposed SOS (sample, optimize, symbios) approach for a two-thread SMT processor, and an average 19\% (and up to 45\%) reduction in job turnaround time for a four-thread SMT processor.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "performance modeling; simultaneous multi-threading (SMT); symbiotic job scheduling", } @Article{Flanagan:2010:AMD, author = "Cormac Flanagan and Stephen N. Freund", title = "Adversarial memory for detecting destructive races", journal = j-SIGPLAN, volume = "45", number = "6", pages = "244--254", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1806596.1806625", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Multithreaded programs are notoriously prone to race conditions, a problem exacerbated by the widespread adoption of multi-core processors with complex memory models and cache coherence protocols. 
Much prior work has focused on static and dynamic analyses for race detection, but these algorithms typically are unable to distinguish destructive races that cause erroneous behavior from benign races that do not. Performing this classification manually is difficult, time consuming, and error prone.\par This paper presents a new dynamic analysis technique that uses {\em adversarial memory\/} to classify race conditions as destructive or benign on systems with relaxed memory models. Unlike a typical language implementation, which may only infrequently exhibit non-sequentially consistent behavior, our adversarial memory implementation exploits the full freedom of the memory model to return older, unexpected, or stale values for memory reads whenever possible, in an attempt to crash the target program (that is, to force the program to behave erroneously). A crashing execution provides concrete evidence of a destructive bug, and this bug can be strongly correlated with a specific race condition in the target program.\par Experimental results with our Jumble prototype for Java demonstrate that adversarial memory is highly effective at identifying destructive race conditions, and in distinguishing them from race conditions that are real but benign. Adversarial memory can also reveal destructive races that would not be detected by traditional testing (even after thousands of runs) or by model checkers that assume sequential consistency.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "concurrency; dynamic analysis; race conditions; relaxed memory models", } @Article{Gibson:2010:FSC, author = "Dan Gibson and David A. 
Wood", title = "{Forwardflow}: a scalable core for power-constrained {CMPs}", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "14--25", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1816038.1815966", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Chip Multiprocessors (CMPs) are now commodity hardware, but commoditization of parallel software remains elusive. In the near term, the current trend of increased core-per-socket count will continue, despite a lack of parallel software to exercise the hardware. Future CMPs must deliver thread-level parallelism when software provides threads to run, but must also continue to deliver performance gains for single threads by exploiting instruction-level parallelism and memory-level parallelism. However, power limitations will prevent conventional cores from exploiting both simultaneously.\par This work presents the Forwardflow Architecture, which can scale its execution logic up to run single threads, or down to run multiple threads in a CMP. Forwardflow dynamically builds an explicit internal dataflow representation from a conventional instruction set architecture, using forward dependence pointers to guide instruction wakeup, selection, and issue. Forwardflow's backend is organized into discrete units that can be individually (de-)activated, allowing each core's performance to be scaled by system software at the architectural level.\par On single threads, Forwardflow core scaling yields a mean runtime reduction of 21\% for a 37\% increase in power consumption. 
For multithreaded workloads, a Forwardflow-based CMP allows system software to select the performance point that best matches available power.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", keywords = "chip multiprocessor (cmp); power; scalable core", } @Article{Gupta:2010:CSM, author = "M. Gupta and F. Sanchez and J. Llosa", title = "{CSMT}: Simultaneous Multithreading for Clustered {VLIW} Processors", journal = j-IEEE-TRANS-COMPUT, volume = "59", number = "3", pages = "385--399", month = mar, year = "2010", CODEN = "ITCOB4", DOI = "https://doi.org/10.1109/TC.2009.96", ISSN = "0018-9340 (print), 1557-9956 (electronic)", ISSN-L = "0018-9340", bibdate = "Sun Jul 3 11:52:26 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=5161255", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Computers", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12", } @Article{Hilton:2010:SDE, author = "Andrew Hilton and Amir Roth", title = "{SMT-Directory}: Efficient Load-Load Ordering for {SMT}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "9", number = "1", pages = "25--28", month = jan # "\slash " # jun, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2010.8", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Jun 20 17:18:18 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Memory models like SC, TSO, and PC enforce load-load ordering, requiring that loads from any single thread appear to occur in program order to all other threads. Out-of-order execution can violate load-load ordering. 
Conventional multi-processors with out-of-order cores detect load-load ordering violations by snooping an age-ordered load queue on cache invalidations or evictions --- events that act as proxies for the completion of remote stores. This mechanism becomes less efficient in an SMT processor, as every completing store must search the load queue segments of all other threads. This inefficiency exists because store completions from other threads in the same core are not filtered by the cache and coherence protocol: thread 0 observes all of thread 1's stores, not only the first store to every cache line. SMT-Directory eliminates this overhead by implementing the filtering traditionally provided by the cache in the cache itself. SMT-Directory adds a per-thread ``read'' bit to every data cache line. When a load executes, it sets the bit corresponding to its thread. When a store completes and writes to the cache, it checks the SMT-Directory bits of its cache line and searches the load queue segments only of those threads whose bits are set. As a result, local store completions trigger searches only for data that is actually shared.", acknowledgement = ack-nhfb, affiliation = "Hilton, A (Reprint Author), Univ Penn, Philadelphia, PA 19104 USA. Hilton, Andrew; Roth, Amir, Univ Penn, Philadelphia, PA 19104 USA.", da = "2019-06-20", doc-delivery-number = "731BP", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "NSF [CCF-0541292]", funding-text = "We thank Arun Raghavan for the address traces and Milo Martin for comments on early versions of this work. The anonymous reviewers provided valuable feedback. This work was supported by NSF award CCF-0541292.", journal-iso = "IEEE Comput. Archit.
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "consistency models; directory; load queue search; load-load ordering; Simultaneous multithreading", keywords-plus = "CONSISTENCY", number-of-cited-references = "9", research-areas = "Computer Science", times-cited = "0", unique-id = "Hilton:2010:SDE", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Illikkal:2010:PQP, author = "Ramesh Illikkal and Vineet Chadha and Andrew Herdrich and Ravi Iyer and Donald Newell", title = "{PIRATE}: {QoS} and performance management in {CMP} architectures", journal = j-SIGMETRICS, volume = "37", number = "4", pages = "3--10", month = mar, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1773394.1773396", ISSN = "0163-5999 (print), 1557-9484 (electronic)", ISSN-L = "0163-5999", bibdate = "Wed Aug 25 07:35:13 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "As new multi-threaded usage models such as virtualization and consolidation take advantage of multiple cores in CMP architectures, the impact of shared resource contention between VMs and user-level applications introduces Quality of Service (QoS) concerns and challenges. QoS-aware management of these shared platform resources is therefore becoming increasingly important. Various QoS schemes for resource management have been recently proposed, but most of these prior efforts have been focused on controlling individual resource allocation based on priority information passed down from the OS or Hypervisor to system resources. The complexity of this approach increases when multiple levels of resources are associated with an application's performance and power consumption. In this paper we employ simpler rate-based QoS mechanisms which control the execution rate of competing applications.
To enable differentiation between simultaneously running applications' performance and power consumption, these rate mechanisms need to dynamically adjust the execution of application. Our proposed PI-RATE architecture introduces a control-theoretic approach to dynamically adjust the execution rate of each application based on the QoS target and monitored resource utilization. We evaluate three modes of PI-RATE architecture --- cache QoS targets, performance QoS targets and power QoS targets --- to show that the PI-RATE architecture is flexible and effective at enabling QoS in a CMP platform.", acknowledgement = ack-nhfb, fjournal = "ACM SIGMETRICS Performance Evaluation Review", journal-URL = "http://portal.acm.org/toc.cfm?id=J618", keywords = "clock modulation; frequency scaling; integral controller; proportional", } @Article{Jang:2010:DTE, author = "Byunghyun Jang and Perhaad Mistry and Dana Schaa and Rodrigo Dominguez and David Kaeli", title = "Data transformations enabling loop vectorization on multithreaded data parallel architectures", journal = j-SIGPLAN, volume = "45", number = "5", pages = "353--354", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837853.1693510", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Loop vectorization, a key feature exploited to obtain high performance on Single Instruction Multiple Data (SIMD) vector architectures, is significantly hindered by irregular memory access patterns in the data stream. This paper describes data transformations that allow us to vectorize loops targeting massively multithreaded data parallel architectures. We present a mathematical model that captures loop-based memory access patterns and computes the most appropriate data transformations in order to enable vectorization. 
Our experimental results show that the proposed data transformations can significantly increase the number of loops that can be vectorized and enhance the data-level parallelism of applications. Our results also show that the overhead associated with our data transformations can be easily amortized as the size of the input data set increases. For the set of high performance benchmark kernels studied, we achieve consistent and significant performance improvements (up to 11.4X) by applying vectorization using our data transformation approach.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "data transformation; GPGPU; loop vectorization", } @Article{Laadan:2010:TLA, author = "Oren Laadan and Nicolas Viennot and Jason Nieh", title = "Transparent, lightweight application execution replay on commodity multiprocessor operating systems", journal = j-SIGMETRICS, volume = "38", number = "1", pages = "155--166", month = jun, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1811039.1811057", ISSN = "0163-5999 (print), 1557-9484 (electronic)", ISSN-L = "0163-5999", bibdate = "Wed Aug 25 07:35:52 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "We present Scribe, the first system to provide transparent, low-overhead application record-replay and the ability to go live from replayed execution. Scribe introduces new lightweight operating system mechanisms, rendezvous and sync points, to efficiently record nondeterministic interactions such as related system calls, signals, and shared memory accesses. Rendezvous points make a partial ordering of execution based on system call dependencies sufficient for replay, avoiding the recording overhead of maintaining an exact execution ordering. 
Sync points convert asynchronous interactions that can occur at arbitrary times into synchronous events that are much easier to record and replay.\par We have implemented Scribe without changing, relinking, or recompiling applications, libraries, or operating system kernels, and without any specialized hardware support such as hardware performance counters. It works on commodity Linux operating systems, and commodity multi-core and multiprocessor hardware. Our results show for the first time that an operating system mechanism can correctly and transparently record and replay multi-process and multi-threaded applications on commodity multiprocessors. Scribe recording overhead is less than 2.5\% for server applications including Apache and MySQL, and less than 15\% for desktop applications including Firefox, Acrobat, OpenOffice, parallel kernel compilation, and movie playback.", acknowledgement = ack-nhfb, fjournal = "ACM SIGMETRICS Performance Evaluation Review", journal-URL = "http://portal.acm.org/toc.cfm?id=J618", keywords = "debugging; fault-tolerance; record-replay; virtualization", } @Article{Lee:2010:REO, author = "Dongyoon Lee and Benjamin Wester and Kaushik Veeraraghavan and Satish Narayanasamy and Peter M. Chen and Jason Flinn", title = "{Respec}: efficient online multiprocessor replay via speculation and external determinism", journal = j-SIGPLAN, volume = "45", number = "3", pages = "77--90", month = mar, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1736020.1736031", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Deterministic replay systems record and reproduce the execution of a hardware or software system. 
While it is well known how to replay uniprocessor systems, replaying shared memory multiprocessor systems at low overhead on commodity hardware is still an open problem. This paper presents Respec, a new way to support deterministic replay of shared memory multithreaded programs on commodity multiprocessor hardware. Respec targets online replay in which the recorded and replayed processes execute concurrently.\par Respec uses two strategies to reduce overhead while still ensuring correctness: speculative logging and externally deterministic replay. Speculative logging optimistically logs less information about shared memory dependencies than is needed to guarantee deterministic replay, then recovers and retries if the replayed process diverges from the recorded process. Externally deterministic replay relaxes the degree to which the two executions must match by requiring only their system output and final program states match. We show that the combination of these two techniques results in low recording and replay overhead for the common case of data-race-free execution intervals and still ensures correct replay for execution intervals that have data races.\par We modified the Linux kernel to implement our techniques. 
Our software system adds on average about 18\% overhead to the execution time for recording and replaying programs with two threads and 55\% overhead for programs with four threads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "external determinism; replay; speculative execution", } @Article{Lin:2010:TAC, author = "Yi-Neng Lin and Ying-Dar Lin and Yuan-Cheng Lai", title = "Thread allocation in {CMP}-based multithreaded network processors", journal = j-PARALLEL-COMPUTING, volume = "36", number = "2--3", pages = "104--116", month = feb # "\slash " # mar, year = "2010", CODEN = "PACOEJ", ISSN = "0167-8191 (print), 1872-7336 (electronic)", ISSN-L = "0167-8191", bibdate = "Thu Sep 2 17:51:12 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sciencedirect.com/science/journal/01678191", acknowledgement = ack-nhfb, fjournal = "Parallel Computing", journal-URL = "http://www.sciencedirect.com/science/journal/01678191", } @Article{Mannarswamy:2010:CAS, author = "Sandya Mannarswamy and Dhruva R. Chakrabarti and Kaushik Rajan and Sujoy Saraswati", title = "Compiler aided selective lock assignment for improving the performance of software transactional memory", journal = j-SIGPLAN, volume = "45", number = "5", pages = "37--46", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1693453.1693460", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Atomic sections have been recently introduced as a language construct to improve the programmability of concurrent software. They simplify programming by not requiring the explicit specification of locks for shared data. 
Typically atomic sections are supported in software either through the use of optimistic concurrency by using transactional memory or through the use of pessimistic concurrency using compiler-assigned locks. As a software transactional memory (STM) system does not take advantage of the specific memory access patterns of an application it often suffers from false conflicts and high validation overheads. On the other hand, the compiler usually ends up assigning coarse grain locks as it relies on whole program points-to analysis which is conservative by nature. This adversely affects performance by limiting concurrency. In order to mitigate the disadvantages associated with STM's lock assignment scheme, we propose a hybrid approach which combines STM's lock assignment with a compiler aided selective lock assignment scheme (referred to as SCLA-STM). SCLA-STM overcomes the inefficiencies associated with a purely compile-time lock assignment approach by (i) using the underlying STM for shared variables where only a conservative analysis is possible by the compiler (e.g., in the presence of may-alias points to information) and (ii) being selective about the shared data chosen for the compiler-aided lock assignment. We describe our prototype SCLA-STM scheme implemented in the HP-UX IA-64 C/C++ compiler, using TL2 as our STM implementation. 
We show that SCLA-STM improves application performance for certain STAMP benchmarks from 1.68\% to 37.13\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "compilers; multithreading; parallelization; performance", } @Article{Marino:2010:DSE, author = "Daniel Marino and Abhayendra Singh and Todd Millstein and Madanlal Musuvathi and Satish Narayanasamy", title = "{DRFX}: a simple and efficient memory model for concurrent programming languages", journal = j-SIGPLAN, volume = "45", number = "6", pages = "351--362", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1806596.1806636", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The most intuitive memory model for shared-memory multithreaded programming is {\em sequential consistency\/} (SC), but it disallows the use of many compiler and hardware optimizations thereby impacting performance. Data-race-free (DRF) models, such as the proposed C++0x memory model, guarantee SC execution for datarace-free programs. But these models provide no guarantee at all for racy programs, compromising the safety and debuggability of such programs. To address the safety issue, the Java memory model, which is also based on the DRF model, provides a weak semantics for racy executions. However, this semantics is subtle and complex, making it difficult for programmers to reason about their programs and for compiler writers to ensure the correctness of compiler optimizations.\par We present the DRFx memory model, which is simple for programmers to understand and use while still supporting many common optimizations. We introduce a {\em memory model (MM) exception\/} which can be signaled to halt execution. 
If a program executes without throwing this exception, then DRFx guarantees that the execution is SC. If a program throws an MM exception during an execution, then DRFx guarantees that the program has a data race. We observe that SC violations can be detected in hardware through a lightweight form of conflict detection. Furthermore, our model safely allows aggressive compiler and hardware optimizations within compiler-designated program regions. We formalize our memory model, prove several properties about this model, describe a compiler and hardware design suitable for DRFx, and evaluate the performance overhead due to our compiler and hardware requirements.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "data races; memory model exception; memory models; sequential consistency; soft fences", } @Article{McKenney:2010:WGM, author = "Paul E. McKenney and Maged M. Michael and Josh Triplett and Jonathan Walpole", title = "Why the grass may not be greener on the other side: a comparison of locking vs. transactional memory", journal = j-OPER-SYS-REV, volume = "44", number = "3", pages = "93--101", month = jul, year = "2010", CODEN = "OSRED8", DOI = "https://doi.org/10.1145/1842733.1842749", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Thu Aug 19 14:21:54 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The advent of multi-core and multi-threaded processor architectures highlights the need to address the well-known shortcomings of the ubiquitous lock-based synchronization mechanisms. To this end, transactional memory has been viewed by many as a promising alternative to locking. 
This paper therefore presents a constructive critique of locking and transactional memory: their strengths, weaknesses, and opportunities for improvement.", acknowledgement = ack-nhfb, fjournal = "ACM SIGOPS Operating Systems Review", journal-URL = "http://portal.acm.org/toc.cfm?id=J597", } @Article{Meng:2010:AOS, author = "Lingchuan Meng and Jeremy Johnson and Franz Franchetti and Yevgen Voronenko and Marc Moreno Maza and Yuzhen Xie", title = "Abstract only: {SPIRAL}-generated modular {FFTs}", journal = j-ACM-COMM-COMP-ALGEBRA, volume = "44", number = "2", pages = "25--26", month = jun, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1838599.1838616", ISSN = "1932-2232 (print), 1932-2240 (electronic)", ISSN-L = "1932-2232", bibdate = "Mon Aug 2 13:47:24 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "In this poster we present the use of the SPIRAL system (www.spiral.net) to generate code for modular Fast Fourier Transforms (FFTs). SPIRAL is a library generation system that automatically generates platform-tuned implementations of digital signal processing algorithms with an emphasis on fast transforms. Currently, SPIRAL can generate highly optimized fixed point and floating-point FFTs for a variety of platforms including vectorization, multi-threaded and distributed memory parallelization. The code produced is competitive with the best available code for these platforms and SPIRAL is used by Intel for its IPP (Intel Performance Primitives) and MKL (Math Kernel Library) libraries.\par The SPIRAL system uses a mathematical framework for representing and deriving algorithms. Algorithms are derived using rewrite rules and additional rules are used to symbolically manipulate algorithms into forms that take advantage of the underlying hardware. A search engine with a feedback loop is used to tune implementations to particular platforms.
New transforms are added by introducing new symbols and their definition and new algorithms can be generated by adding new rules.\par We extended SPIRAL to generate algorithms for FFT computation over finite fields. This addition required adding a new data type, several new rules and a new transform (ModDFT) definition. In addition, the unparser (where code is generated) was extended so that it can generate scalar and vectorized code for modular arithmetic. With these enhancements, the SPIRAL machinery can be applied to modular transforms that are of interest to the computer algebra community. This provides a framework for systematically optimizing these transforms, utilizing vector and parallel computation, and for automatically tuning them to different platforms. In this poster we present preliminary results from this exploration. We show that the code generated by SPIRAL, with improved cache locality and vectorization, is approximately ten times faster than the modular FFT code in the modpn library.", acknowledgement = ack-nhfb, fjournal = "ACM Communications in Computer Algebra", issue = "172", } @Article{Meng:2010:DWS, author = "Jiayuan Meng and David Tarjan and Kevin Skadron", title = "Dynamic warp subdivision for integrated branch and memory divergence tolerance", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "235--246", month = jun, year = "2010", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1815961.1815992", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "SIMD organizations amortize the area and power of fetch, decode, and issue logic across multiple processing units in order to maximize throughput for a given area and power budget. However, throughput is reduced when a set of threads operating in lockstep (a warp) are stalled due to long latency memory accesses.
The resulting idle cycles are extremely costly. Multi-threading can hide latencies by interleaving the execution of multiple warps, but deep multi-threading using many warps dramatically increases the cost of the register files (multi-threading depth $ \times $ SIMD width), and cache contention can make performance worse. Instead, intra-warp latency hiding should first be exploited. This allows threads that are ready but stalled by SIMD restrictions to use these idle cycles and reduces the need for multi-threading among warps. This paper introduces {\em dynamic warp subdivision\/} (DWS), which allows a single warp to occupy more than one slot in the scheduler without requiring extra register file space. Independent scheduling entities allow divergent branch paths to interleave their execution, and allow threads that hit to run ahead. The result is improved latency hiding and memory level parallelism (MLP). We evaluate the technique on a coherent cache hierarchy with private L1 caches and a shared L2 cache. 
With an area overhead of less than 1\%, experiments with eight data-parallel benchmarks show our technique improves performance on average by 1.7$ \times $.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", keywords = "branch divergence; cache; latency hiding; memory divergence; SIMD; warp", } @Article{Muralidhara:2010:IAS, author = "Sai Prashanth Muralidhara and Mahmut Kandemir and Padma Raghavan", title = "Intra-application shared cache partitioning for multithreaded applications", journal = j-SIGPLAN, volume = "45", number = "5", pages = "329--330", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837853.1693498", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "In this paper, we address the problem of partitioning a shared cache when the executing threads belong to the same application.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "cache; multicore; parallel applications", } @Article{Nakaike:2010:LER, author = "Takuya Nakaike and Maged M. Michael", title = "Lock elision for read-only critical sections in {Java}", journal = j-SIGPLAN, volume = "45", number = "6", pages = "269--278", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1806596.1806627", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "It is not uncommon in parallel workloads to encounter shared data structures with read-mostly access patterns, where operations that update data are infrequent and most operations are read-only. 
Typically, data consistency is guaranteed using mutual exclusion or read-write locks. The cost of atomic update of lock variables results in high overheads and high cache coherence traffic under active sharing, thus slowing down single thread performance and limiting scalability.\par In this paper, we present {\em SOLERO (Software Optimistic Lock Elision for Read-Only critical sections)}, a new lock implementation for optimizing read-only critical sections in Java based on sequential locks. SOLERO is compatible with the conventional lock implementation of Java. However, unlike the conventional implementation, only critical sections that may write data or have side effects need to update lock variables, while read-only critical sections need only read lock variables without writing them. Each writing critical section changes the lock value to a new value. Hence, a read-only critical section is guaranteed to be consistent if the lock is free and its value does not change from the beginning to the end of the read-only critical section.\par Using Java workloads including SPECjbb2005 and the HashMap and TreeMap Java classes, we evaluate the performance impact of applying SOLERO to read-mostly locks. Our experimental results show performance improvements across the board, often substantial, in both single thread speed and scalability over the conventional lock implementation (mutual exclusion) and read-write locks. SOLERO improves the performance of SPECjbb2005 by 3--5\% on single and multiple threads.
The results using the HashMap and TreeMap benchmarks show that SOLERO outperforms the conventional lock implementation and read-write locks by substantial multiples on multi-threads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "java; just-in-time compiler; lock; lock elision; monitor; optimization; synchronization", } @Article{Park:2010:ISP, author = "Jung-Wook Park and Hoon-Mo Yang and Gi-Ho Park and Shin-Dug Kim and Charles C. Weems", title = "An instruction-systolic programmable shader architecture for multi-threaded {$3$D} graphics processing", journal = j-J-PAR-DIST-COMP, volume = "70", number = "11", pages = "1110--1118", month = nov, year = "2010", CODEN = "JPDCER", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Wed Sep 1 16:27:29 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sciencedirect.com/science/journal/07437315", acknowledgement = ack-nhfb, fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", } @Article{Radojkovic:2010:TSB, author = "Petar Radojkovi{\'c} and Vladimir {\v{C}}akarevi{\'c} and Javier Verd{\'u} and Alex Pajuelo and Francisco J. 
Cazorla and Mario Nemirovsky and Mateo Valero", title = "Thread to strand binding of parallel network applications in massive multi-threaded systems", journal = j-SIGPLAN, volume = "45", number = "5", pages = "191--202", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1837853.1693480", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "In processors with several levels of hardware resource sharing, like CMPs in which each core is an SMT, the scheduling process becomes more complex than in processors with a single level of resource sharing, such as pure-SMT or pure-CMP processors. Once the operating system selects the set of applications to simultaneously schedule on the processor (workload), each application/thread must be assigned to one of the hardware contexts (strands). We call this last scheduling step the Thread to Strand Binding or TSB. In this paper, we show that the TSB impact on the performance of processors with several levels of shared resources is high. We measure a variation of up to 59\% between different TSBs of real multithreaded network applications running on the UltraSPARC T2 processor which has three levels of resource sharing. In our view, this problem is going to be more acute in future multithreaded architectures comprising more cores, more contexts per core, and more levels of resource sharing.\par We propose a resource-sharing aware TSB algorithm (TSBSched) that significantly facilitates the problem of thread to strand binding for software-pipelined applications, representative of multithreaded network applications. Our systematic approach encapsulates both the characteristics of multithreaded processors under the study and the structure of the software pipelined applications.
Once calibrated for a given processor architecture, our proposal does not require hardware knowledge on the side of the programmer, nor extensive profiling of the application. We validate our algorithm on the UltraSPARC T2 processor running a set of real multithreaded network applications on which we report improvements of up to 46\% compared to the current state-of-the-art dynamic schedulers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "CMT; process scheduling; simultaneous multithreading; UltraSPARC T2", } @Article{Rakvic:2010:TMT, author = "R. Rakvic and Q. Cai and J. Gonz{\'a}lez and G. Magklis and P. Chaparro and A. Gonz{\'a}lez", title = "Thread-management techniques to maximize efficiency in multicore and simultaneous multithreaded microprocessors", journal = j-TACO, volume = "7", number = "2", pages = "9:1--9:??", month = sep, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1839667.1839671", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Oct 2 18:05:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "We provide an analysis of thread-management techniques that increase performance or reduce energy in multicore and Simultaneous Multithreaded (SMT) cores. Thread delaying reduces energy consumption by running the core containing the critical thread at maximum frequency while scaling down the frequency and voltage of the cores containing noncritical threads. In this article, we provide an insightful breakdown of thread delaying on a simulated multi-core microprocessor. Thread balancing improves overall performance by giving higher priority to the critical thread in the issue queue of an SMT core. We provide a detailed breakdown of performance results for thread-balancing, identifying performance benefits and limitations. 
For those benchmarks where a performance benefit is not possible, we introduce a novel thread-balancing mechanism on an SMT core that can reduce energy consumption. We have performed a detailed study on an Intel microprocessor simulator running parallel applications. Thread delaying can reduce energy consumption by 4\% to 44\% with negligible performance loss. Thread balancing can increase performance by 20\% or can reduce energy consumption by 23\%.", acknowledgement = ack-nhfb, articleno = "9", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", keywords = "critical threads; energy-aware; low-power; Meeting point thread characterization; microarchitecture; multi-threaded application; thread balancing; thread delaying", } @Article{Raman:2010:SPUa, author = "Arun Raman and Hanjun Kim and Thomas R. Mason and Thomas B. Jablin and David I. August", title = "Speculative parallelization using software multi-threaded transactions", journal = j-COMP-ARCH-NEWS, volume = "38", number = "1", pages = "65--76", month = mar, year = "2010", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Mar 17 14:42:04 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Raman:2010:SPUb, author = "Arun Raman and Hanjun Kim and Thomas R. Mason and Thomas B. Jablin and David I. 
August", title = "Speculative parallelization using software multi-threaded transactions", journal = j-SIGPLAN, volume = "45", number = "3", pages = "65--76", month = mar, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1736020.1736030", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "With the right techniques, multicore architectures may be able to continue the exponential performance trend that elevated the performance of applications of all types for decades. While many scientific programs can be parallelized without speculative techniques, speculative parallelism appears to be the key to continuing this trend for general-purpose applications. Recently-proposed code parallelization techniques, such as those by Bridges et al. and by Thies et al., demonstrate scalable performance on multiple cores by using speculation to divide code into atomic units (transactions) that span multiple threads in order to expose data parallelism. Unfortunately, most software and hardware Thread-Level Speculation (TLS) memory systems and transactional memories are not sufficient because they only support single-threaded atomic units. Multi-threaded Transactions (MTXs) address this problem, but they require expensive hardware support as currently proposed in the literature. This paper proposes a Software MTX (SMTX) system that captures the {\em applicability\/} and {\em performance\/} of hardware MTX, but on {\em existing multicore machines}. 
The SMTX system yields a harmonic mean speedup of 13.36x on native hardware with four 6-core processors (24 cores in total) running speculatively parallelized applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "automatic parallelization; loop-level parallelism; multi-threaded transactions; pipelined parallelism; software transactional memory; thread-level speculation", } @Article{Rashid:2010:AEP, author = "Layali Rashid and Wessam M. Hassanein and Moustafa A. Hammad", title = "Analyzing and enhancing the parallel sort operation on multithreaded architectures", journal = j-J-SUPERCOMPUTING, volume = "53", number = "2", pages = "293--312", month = aug, year = "2010", CODEN = "JOSUED", ISSN = "0920-8542 (print), 1573-0484 (electronic)", ISSN-L = "0920-8542", bibdate = "Wed Aug 25 08:39:00 MDT 2010", bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=53&issue=2; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=53&issue=2&spage=293", acknowledgement = ack-nhfb, fjournal = "The Journal of Supercomputing", journal-URL = "http://link.springer.com/journal/11227", } @Article{Sanchez:2010:ACI, author = "Daniel Sanchez and George Michelogiannakis and Christos Kozyrakis", title = "An analysis of on-chip interconnection networks for large-scale chip multiprocessors", journal = j-TACO, volume = "7", number = "1", pages = "4:1--4:??", month = apr, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1756065.1736069", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed May 5 15:38:13 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "With the number of cores of chip multiprocessors (CMPs) rapidly growing as technology scales down, connecting the different 
components of a CMP in a scalable and efficient way becomes increasingly challenging. In this article, we explore the architectural-level implications of interconnection network design for CMPs with up to 128 fine-grain multithreaded cores. We evaluate and compare different network topologies using accurate simulation of the full chip, including the memory hierarchy and interconnect, and using a diverse set of scientific and engineering workloads.\par We find that the interconnect has a large impact on performance, as it is responsible for 60\% to 75\% of the miss latency. Latency, and not bandwidth, is the primary performance constraint, since, even with many threads per core and workloads with high miss rates, networks with enough bandwidth can be efficiently implemented for the system scales we consider. From the topologies we study, the flattened butterfly consistently outperforms the mesh and fat tree on all workloads, leading to performance advantages of up to 22\%. We also show that considering interconnect and memory hierarchy together when designing large-scale CMPs is crucial, and neglecting either of the two can lead to incorrect conclusions. Finally, the effect of the interconnect on overall performance becomes more important as the number of cores increases, making interconnection choices especially critical when scaling up.", acknowledgement = ack-nhfb, articleno = "4", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", keywords = "chip multiprocessors; hierarchical networks; Networks-on-chip", } @Article{Sodan:2010:PMM, author = "Angela C. 
Sodan and Jacob Machina and Arash Deshmeh and Kevin Macnaughton and Bryan Esbaugh", title = "Parallelism via Multithreaded and Multicore {CPUs}", journal = j-COMPUTER, volume = "43", number = "3", pages = "24--32", month = mar, year = "2010", CODEN = "CPTRB4", DOI = "https://doi.org/10.1109/MC.2010.75", ISSN = "0018-9162 (print), 1558-0814 (electronic)", ISSN-L = "0018-9162", bibdate = "Wed May 12 22:57:42 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/computer2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Computer", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2", } @Article{Soundararajan:2010:CSE, author = "Niranjan Soundararajan and Anand Sivasubramaniam and Vijay Narayanan", title = "Characterizing the soft error vulnerability of multicores running multithreaded applications", journal = j-SIGMETRICS, volume = "38", number = "1", pages = "379--380", month = jun, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1811099.1811096", ISSN = "0163-5999 (print), 1557-9484 (electronic)", ISSN-L = "0163-5999", bibdate = "Wed Aug 25 07:35:52 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Multicores have become the platform of choice across all market segments. Cost-effective protection against soft errors is important in these environments, due to the need to move to lower technology generations and the exploding number of transistors on a chip. While multicores offer the flexibility of varying the number of application threads and the number of cores on which they run, the reliability impact of choosing one configuration over another is unclear. 
Our study reveals that the reliability costs vary dramatically between configurations and being unaware could lead to a sub-optimal choice.", acknowledgement = ack-nhfb, fjournal = "ACM SIGMETRICS Performance Evaluation Review", journal-URL = "http://portal.acm.org/toc.cfm?id=J618", keywords = "fit rate; multicore; soft errors", } @Article{Sutherland:2010:CTC, author = "Dean F. Sutherland and William L. Scherlis", title = "Composable thread coloring", journal = j-SIGPLAN, volume = "45", number = "5", pages = "233--244", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1693453.1693485", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "This paper introduces the language-independent concept of ``thread usage policy.'' Many multi-threaded software systems contain policies that regulate associations among threads, executable code, and potentially shared state. A system, for example, may constrain which threads are permitted to execute particular code segments, usually as a means to constrain those threads from accessing or writing particular elements of state. These policies ensure properties such as state confinement or reader/writer constraints, often without recourse to locking or transaction discipline.\par Our approach allows developers to concisely document their thread usage policies in a manner that enables the use of sound scalable analysis to assess consistency of policy and as-written code. 
This paper identifies the key semantic concepts of our thread coloring language and illustrates how to use its succinct source-level annotations to express models of thread usage policies, following established annotation conventions for Java.\par We have built a prototype static analysis tool, implemented as an integrated development environment plug-in (for the Eclipse IDE), that notifies developers of discrepancies between policy annotations and as-written code. Our analysis technique uses several underlying algorithms based on abstract interpretation, call-graphs, and type inference. The resulting overall analysis is both sound and composable. We have used this prototype analysis tool in case studies to model and analyze more than a million lines of code.\par Our validation process included field trials on a wide variety of complex large-scale production code selected by the host organizations. Our in-field experience led us to focus on potential adoptability by real-world developers. We have developed techniques that can reduce annotation density to less than one line per thousand lines of code (KLOC). In addition, the prototype analysis tool supports an incremental and iterative approach to modeling and analysis. This approach enabled field trial partners to directly target areas of greatest concern and to achieve useful results within a few hours.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "annotation; Java; multicore; race conditions; state confinement; state consistency; thread policy", } @Article{Tallent:2010:ALC, author = "Nathan R. Tallent and John M.
Mellor-Crummey and Allan Porterfield", title = "Analyzing lock contention in multithreaded applications", journal = j-SIGPLAN, volume = "45", number = "5", pages = "269--280", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1693453.1693489", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Many programs exploit shared-memory parallelism using multithreading. Threaded codes typically use locks to coordinate access to shared data. In many cases, contention for locks reduces parallel efficiency and hurts scalability. Being able to quantify and attribute lock contention is important for understanding where a multithreaded program needs improvement.\par This paper proposes and evaluates three strategies for gaining insight into performance losses due to lock contention. First, we consider using a straightforward strategy based on call stack profiling to attribute idle time and show that it fails to yield insight into lock contention. Second, we consider an approach that builds on a strategy previously used for analyzing idleness in work-stealing computations; we show that this strategy does not yield insight into lock contention. Finally, we propose a new technique for measurement and analysis of lock contention that uses data associated with locks to blame lock holders for the idleness of spinning threads. Our approach incurs $ \leq $ 5\% overhead on a quantum chemistry application that makes extensive use of locking (65M distinct locks, a maximum of 340K live locks, and an average of 30K lock acquisitions per second per thread) and attributes lock contention to its full static and dynamic calling contexts. 
Our strategy, implemented in HPCToolkit, is fully distributed and should scale well to systems with large core counts.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "HPCToolkit; lock contention; multithreading; performance analysis", } @Article{Tentyukov:2010:MVF, author = "M. Tentyukov and J. A. M. Vermaseren", title = "The multithreaded version of {FORM}", journal = j-COMP-PHYS-COMM, volume = "181", number = "8", pages = "1419--1427", month = aug, year = "2010", CODEN = "CPHCBZ", DOI = "https://doi.org/10.1016/j.cpc.2010.04.009", ISSN = "0010-4655 (print), 1879-2944 (electronic)", ISSN-L = "0010-4655", bibdate = "Sat Feb 11 09:54:30 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.sciencedirect.com/science/article/pii/S0010465510001207", acknowledgement = ack-nhfb, fjournal = "Computer Physics Communications", journal-URL = "http://www.sciencedirect.com/science/journal/00104655", } @Article{Tian:2010:SPU, author = "Chen Tian and Min Feng and Rajiv Gupta", title = "Speculative parallelization using state separation and multiple value prediction", journal = j-SIGPLAN, volume = "45", number = "8", pages = "63--72", month = aug, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1806651.1806663", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:55:48 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "With the availability of chip multiprocessor (CMP) and simultaneous multithreading (SMT) machines, extracting thread level parallelism from a sequential program has become crucial for improving performance. However, many sequential programs cannot be easily parallelized due to the presence of dependences. 
To solve this problem, different solutions have been proposed. Some of them make the optimistic assumption that such dependences rarely manifest themselves at runtime. However, when this assumption is violated, the recovery causes very large overhead. Other approaches incur large synchronization or computation overhead when resolving the dependences. Consequently, for a loop with frequently arising cross-iteration dependences, previous techniques are not able to speed up the execution. In this paper we propose a compiler technique which uses state separation and multiple value prediction to speculatively parallelize loops in sequential programs that contain frequently arising cross-iteration dependences. The key idea is to generate multiple versions of a loop iteration based on multiple predictions of values of variables involved in cross-iteration dependences (i.e., live-in variables). These speculative versions and the preceding loop iteration are executed in separate memory states simultaneously. After the execution, if one of these versions is correct (i.e., its predicted values are found to be correct), then we merge its state and the state of the preceding iteration because the dependence between the two iterations is correctly resolved. The memory states of other incorrect versions are completely discarded. Based on this idea, we further propose a runtime adaptive scheme that not only gives a good performance but also achieves better CPU utilization. We conducted experiments on 10 benchmark programs on a real machine. 
The results show that our technique can achieve 1.7x speedup on average across all used benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "multicore processors; speculative parallelization", } @Article{Torlak:2010:MCA, author = "Emina Torlak and Mandana Vaziri and Julian Dolby", title = "{MemSAT}: checking axiomatic specifications of memory models", journal = j-SIGPLAN, volume = "45", number = "6", pages = "341--350", month = jun, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1806596.1806635", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Memory models are hard to reason about due to their complexity, which stems from the need to strike a balance between ease-of-programming and allowing compiler and hardware optimizations. In this paper, we present an automated tool, MemSAT, that helps in debugging and reasoning about memory models. Given an axiomatic specification of a memory model and a multi-threaded test program containing assertions, MemSAT outputs a trace of the program in which both the assertions and the memory model axioms are satisfied, if one can be found. The tool is fully automatic and is based on a SAT solver. If it cannot find a trace, it outputs a minimal subset of the memory model and program constraints that are unsatisfiable. We used MemSAT to check several existing memory models against their published test cases, including the current Java Memory Model by Manson et al. and a revised version of it by Sevcik and Aspinall. 
We found subtle discrepancies between what was expected and the actual results of test programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "axiomatic specifications; bounded model checking; memory models; sat", } @Article{Trott:2010:AVI, author = "Oleg Trott and Arthur J. Olson", title = "{AutoDock Vina}: {Improving} the speed and accuracy of docking with a new scoring function, efficient optimization, and multithreading", journal = j-J-COMPUT-CHEM, volume = "31", number = "2", pages = "455--461", day = "30", month = jan, year = "2010", CODEN = "JCCHDD", DOI = "https://doi.org/10.1002/jcc.21334", ISSN = "0192-8651 (print), 1096-987X (electronic)", ISSN-L = "0192-8651", bibdate = "Thu Nov 29 14:55:23 MST 2012", bibsource = "http://www.interscience.wiley.com/jpages/0192-8651; https://www.math.utah.edu/pub/tex/bib/jcomputchem2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Journal of Computational Chemistry", journal-URL = "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1096-987X", onlinedate = "4 Jun 2009", } @Article{Vlachos:2010:PEAa, author = "Evangelos Vlachos and Michelle L. Goodstein and Michael A. Kozuch and Shimin Chen and Babak Falsafi and Phillip B. Gibbons and Todd C. Mowry", title = "{ParaLog}: enabling and accelerating online parallel monitoring of multithreaded applications", journal = j-COMP-ARCH-NEWS, volume = "38", number = "1", pages = "271--284", month = mar, year = "2010", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Mar 17 14:42:04 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Vlachos:2010:PEAb, author = "Evangelos Vlachos and Michelle L. 
Goodstein and Michael A. Kozuch and Shimin Chen and Babak Falsafi and Phillip B. Gibbons and Todd C. Mowry", title = "{ParaLog}: enabling and accelerating online parallel monitoring of multithreaded applications", journal = j-SIGPLAN, volume = "45", number = "3", pages = "271--284", month = mar, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1736020.1736051", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "{\em Instruction-grain lifeguards\/} monitor the events of a running application at the level of individual instructions in order to identify and help mitigate application bugs and security exploits. Because such lifeguards impose a 10-100X slowdown on existing platforms, previous studies have proposed hardware designs to accelerate lifeguard processing. However, these accelerators are either tailored to a specific class of lifeguards or suitable only for monitoring single-threaded programs.\par We present ParaLog, the first design of a system enabling fast online parallel monitoring of multithreaded parallel applications. ParaLog supports a broad class of software-defined lifeguards. We show how three existing accelerators can be enhanced to support online multithreaded monitoring, dramatically reducing lifeguard overheads. We identify and solve several challenges in monitoring parallel applications and/or parallelizing these accelerators, including (i) enforcing inter-thread data dependences, (ii) dealing with inter-thread effects that are not reflected in coherence traffic, (iii) dealing with unmonitored operating system activity, and (iv) ensuring lifeguards can access shared metadata with negligible synchronization overheads. We present our system design for both Sequentially Consistent and Total Store Ordering processors. 
We implement and evaluate our design on a 16 core simulated CMP, using benchmarks from SPLASH-2 and PARSEC and two lifeguards: a data-flow tracking lifeguard and a memory-access checker lifeguard. Our results show that (i) our parallel accelerators improve performance by 2-9X and 1.13-3.4X for our two lifeguards, respectively, (ii) we are 5-126X faster than the time-slicing approach required by existing techniques, and (iii) our average overheads for applications with eight threads are 51\% and 28\% for the two lifeguards, respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "hardware support for debugging; instruction-grain lifeguards; online parallel monitoring", } @Article{Welch:2010:SCF, author = "Peter H. Welch and Jan B. Pedersen", title = "{Santa Claus}: {Formal} analysis of a process-oriented solution", journal = j-TOPLAS, volume = "32", number = "4", pages = "14:1--14:37", month = apr, year = "2010", CODEN = "ATPSDT", DOI = "https://doi.org/10.1145/1734206.1734211", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Fri May 21 12:47:03 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/toplas/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "With the commercial development of multicore processors, the challenges of writing multithreaded programs to take advantage of these new hardware architectures are becoming more and more pertinent. Concurrent programming is necessary to achieve the performance that the hardware offers. Traditional approaches present concurrency as an {\em advanced\/} topic: they have proven difficult to use, reason about with confidence, and scale up to high levels of concurrency. 
This article reviews {\em process-oriented design}, based on Hoare's algebra of Communicating Sequential Processes (CSP), and proposes that this approach to concurrency leads to solutions that are manageable by novice programmers; that is, they are easy to design and maintain, that they are scalable for complexity, {\em obviously correct}, and relatively easy to verify using formal reasoning and/or model checkers. These solutions can be developed in conventional programming languages (through CSP libraries) or specialized ones (such as occam-$\pi$) in a manner that directly reflects their formal expression. Systems can be developed without needing specialist knowledge of the CSP formalism, since the supporting mathematics is burnt into the tools and languages supporting it. We illustrate these concepts with the {\em Santa Claus problem}, which has been used as a challenge for concurrency mechanisms since 1994. We consider this problem as an example control system, producing external signals reporting changes of internal state (that model the external world). We claim our occam-$\pi$ solution is {\em correct-by-design}, but follow this up with formal verification (using the FDR model checker for CSP) that the system is free from deadlock and livelock, that the produced control signals obey crucial ordering constraints, and that the system has key liveness properties.", acknowledgement = ack-nhfb, articleno = "14", fjournal = "ACM Transactions on Programming Languages and Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783", keywords = "concurrency; CSP; deadlock; event ordering; liveness; novice programmer; occam-pi; Process orientation; verification", } @Article{Wendykier:2010:PCH, author = "Piotr Wendykier and James G.
Nagy", title = "{Parallel Colt}: a High-Performance {Java} Library for Scientific Computing and Image Processing", journal = j-TOMS, volume = "37", number = "3", pages = "31:1--31:22", month = sep, year = "2010", CODEN = "ACMSCU", DOI = "https://doi.org/10.1145/1824801.1824809", ISSN = "0098-3500 (print), 1557-7295 (electronic)", ISSN-L = "0098-3500", bibdate = "Mon Sep 27 10:15:50 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Major breakthroughs in chip and software design have been observed for the last nine years. In October 2001, IBM released the world's first multicore processor: POWER4. Six years later, in February 2007, NVIDIA made a public release of CUDA SDK, a set of development tools to write algorithms for execution on Graphic Processing Units (GPUs). Although software vendors have started working on parallelizing their products, the vast majority of existing code is still sequential and does not effectively utilize modern multicore CPUs and manycore GPUs.\par This article describes Parallel Colt, a multithreaded Java library for scientific computing and image processing. In addition to describing the design and functionality of Parallel Colt, a comparison to MATLAB is presented. Two ImageJ plugins for iterative image deblurring and motion correction of PET brain images are described as typical applications of this library. Performance comparisons with MATLAB, including GPU computations via AccelerEyes' Jacket toolbox are also given.", acknowledgement = ack-nhfb, articleno = "31", fjournal = "ACM Transactions on Mathematical Software (TOMS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J782", keywords = "Deconvolution; FFT; inverse problems; iterative methods; motion correction; multithreading; PET; regularization", } @Article{Wheeler:2010:VMM, author = "Kyle B. 
Wheeler and Douglas Thain", title = "Visualizing massively multithreaded applications with {ThreadScope}", journal = j-CCPE, volume = "22", number = "1", pages = "45--67", month = jan, year = "2010", CODEN = "CCPEBO", DOI = "https://doi.org/10.1002/cpe.1469", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Mon Dec 5 10:08:40 MST 2011", bibsource = "http://www.interscience.wiley.com/jpages/1532-0626; https://www.math.utah.edu/pub/tex/bib/ccpe.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Concurrency and Computation: Prac\-tice and Experience", journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626", onlinedate = "13 Aug 2009", } @Article{Yi:2010:NAS, author = "Kyueun Yi and J.-L. Gaudiot", title = "Network Applications on Simultaneous Multithreading Processors", journal = j-IEEE-TRANS-COMPUT, volume = "59", number = "9", pages = "1200--1209", month = sep, year = "2010", CODEN = "ITCOB4", DOI = "https://doi.org/10.1109/TC.2009.185", ISSN = "0018-9340 (print), 1557-9956 (electronic)", ISSN-L = "0018-9340", bibdate = "Sun Jul 3 11:52:32 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=5374374", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Computers", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12", } @Article{Zhang:2010:DCS, author = "Eddy Z. 
Zhang and Yunlian Jiang and Xipeng Shen", title = "Does cache sharing on modern {CMP} matter to the performance of contemporary multithreaded programs?", journal = j-SIGPLAN, volume = "45", number = "5", pages = "203--212", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1693453.1693482", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Most modern Chip Multiprocessors (CMP) feature shared cache on chip. For multithreaded applications, the sharing reduces communication latency among co-running threads, but also results in cache contention.\par A number of studies have examined the influence of cache sharing on multithreaded applications, but most of them have concentrated on the design or management of shared cache, rather than a systematic measurement of the influence. Consequently, prior measurements have been constrained by the reliance on simulators, the use of out-of-date benchmarks, and the limited coverage of deciding factors. The influence of CMP cache sharing on contemporary multithreaded applications remains preliminarily understood.\par In this work, we conduct a systematic measurement of the influence on two kinds of commodity CMP machines, using a recently released CMP benchmark suite, PARSEC, with a number of potentially important factors on program, OS, and architecture levels considered. The measurement shows some surprising results. Contrary to commonly perceived importance of cache sharing, neither positive nor negative effects from the cache sharing are significant for most of the program executions, regardless of the types of parallelism, input datasets, architectures, numbers of threads, and assignments of threads to cores. 
After a detailed analysis, we find that the main reason is the mismatch of current development and compilation of multithreaded applications and CMP architectures. By transforming the programs in a cache-sharing-aware manner, we observe up to 36\% performance increase when the threads are placed on cores appropriately.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "chip multiprocessors; parallel program optimizations; shared cache; thread scheduling", } @Article{Zhang:2010:FTS, author = "Yao Zhang and Jonathan Cohen and John D. Owens", title = "Fast tridiagonal solvers on the {GPU}", journal = j-SIGPLAN, volume = "45", number = "5", pages = "127--136", month = may, year = "2010", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1693453.1693472", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "We study the performance of three parallel algorithms and their hybrid variants for solving tridiagonal linear systems on a GPU: cyclic reduction (CR), parallel cyclic reduction (PCR) and recursive doubling (RD). We develop an approach to measure, analyze, and optimize the performance of GPU programs in terms of memory access, computation, and control overhead. We find that CR enjoys linear algorithm complexity but suffers from more algorithmic steps and bank conflicts, while PCR and RD have fewer algorithmic steps but do more work each step. To combine the benefits of the basic algorithms, we propose hybrid CR+PCR and CR+RD algorithms, which improve the performance of PCR, RD and CR by 21\%, 31\% and 61\% respectively. 
Our GPU solvers achieve up to a 28x speedup over a sequential LAPACK solver, and a 12x speedup over a multi-threaded CPU solver.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "GPGPU; performance optimization; tridiagonal linear system", } @Article{Zier:2010:PED, author = "David A. Zier and Ben Lee", title = "Performance Evaluation of Dynamic Speculative Multithreading with the {Cascadia} Architecture", journal = j-IEEE-TRANS-PAR-DIST-SYS, volume = "21", number = "1", pages = "47--59", month = jan, year = "2010", CODEN = "ITDSEO", DOI = "https://doi.org/10.1109/TPDS.2009.47", ISSN = "1045-9219 (print), 1558-2183 (electronic)", ISSN-L = "1045-9219", bibdate = "Thu May 13 12:06:56 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Parallel and Distributed Systems", journal-URL = "http://www.computer.org/tpds/archives.htm", } @Article{Bajaj:2011:FFP, author = "Chandrajit L. Bajaj and Rezaul Chowdhury and Vinay Siddahanavalli", title = "{$ F^2 $Dock}: Fast {Fourier} Protein-Protein Docking", journal = j-TCBB, volume = "8", number = "1", pages = "45--58", month = jan, year = "2011", CODEN = "ITCBCY", DOI = "https://doi.org/10.1109/TCBB.2009.57", ISSN = "1545-5963 (print), 1557-9964 (electronic)", ISSN-L = "1545-5963", bibdate = "Mon Dec 20 18:39:04 MST 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The functions of proteins are often realized through their mutual interactions. Determining a relative transformation for a pair of proteins and their conformations which form a stable complex, reproducible in nature, is known as docking. It is an important step in drug design, structure determination, and understanding function and structure relationships. 
In this paper, we extend our nonuniform fast Fourier transform-based docking algorithm to include an adaptive search phase (both translational and rotational) and thereby speed up its execution. We have also implemented a multithreaded version of the adaptive docking algorithm for even faster execution on multicore machines. We call this protein-protein docking code {$F^2$Dock} ($F^2 = {\rm \underline{F}ast\ \underline{F}ourier}$).", acknowledgement = ack-nhfb, fjournal = "IEEE/ACM Transactions on Computational Biology and Bioinformatics", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J954", } @Article{Ball:2011:PPT, author = "Thomas Ball and Sebastian Burckhardt and Peli de Halleux and Madan Musuvathi and Shaz Qadeer", title = "Predictable and Progressive Testing of Multithreaded Code", journal = j-IEEE-SOFTWARE, volume = "28", number = "3", pages = "75--83", month = may # "\slash " # jun, year = "2011", CODEN = "IESEDJ", DOI = "https://doi.org/10.1109/MS.2010.64", ISSN = "0740-7459 (print), 0740-7459 (electronic)", ISSN-L = "0740-7459", bibdate = "Thu Apr 28 08:41:06 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeesoft.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Software", journal-URL = "http://www.computer.org/portal/web/csdl/magazines/software", } @Article{Bientinesi:2011:CFS, author = "Paolo Bientinesi and Francisco D. Igual and Daniel Kressner and Matthias Petschow and Enrique S.
Quintana-Ort{\'\i}", title = "Condensed forms for the symmetric eigenvalue problem on multi-threaded architectures", journal = j-CCPE, volume = "23", number = "7", pages = "694--707", month = may, year = "2011", CODEN = "CCPEBO", DOI = "https://doi.org/10.1002/cpe.1680", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Mon Dec 5 10:08:55 MST 2011", bibsource = "http://www.interscience.wiley.com/jpages/1532-0626; https://www.math.utah.edu/pub/tex/bib/ccpe.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Concurrency and Computation: Practice and Experience", journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626", onlinedate = "8 Nov 2010", } @Article{Burnim:2011:SCSa, author = "Jacob Burnim and George Necula and Koushik Sen", title = "Specifying and checking semantic atomicity for multithreaded programs", journal = j-COMP-ARCH-NEWS, volume = "39", number = "1", pages = "79--90", month = mar, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/1961295.1950377", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Aug 18 13:45:25 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Burnim:2011:SCSb, author = "Jacob Burnim and George Necula and Koushik Sen", title = "Specifying and checking semantic atomicity for multithreaded programs", journal = j-SIGPLAN, volume = "46", number = "3", pages = "79--90", month = mar, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1961296.1950377", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 24 10:55:08 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = 
"ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "ASPLOS '11 conference proceedings", } @Article{Butler:2011:BAM, author = "Michael Butler and Leslie Barnes and Debjit Das Sarma and Bob Gelinas", title = "{Bulldozer}: An Approach to Multithreaded Compute Performance", journal = j-IEEE-MICRO, volume = "31", number = "2", pages = "6--15", month = mar # "\slash " # apr, year = "2011", CODEN = "IEMIDZ", DOI = "https://doi.org/10.1109/MM.2011.23", ISSN = "0272-1732 (print), 1937-4143 (electronic)", ISSN-L = "0272-1732", bibdate = "Tue Apr 26 13:50:28 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "AMD's Bulldozer module represents a new direction in microarchitecture and includes a number of firsts for AMD, including AMD's multithreaded x86 processor, implementation of a shared Level 2 cache, and x86 processor to incorporate floating-point multiply-accumulate (FMAC). This article discusses the module's multithreading architecture, power-efficient microarchitecture, and subblocks, including the various microarchitectural latencies, bandwidths, and structure sizes.", acknowledgement = ack-nhfb, fjournal = "IEEE Micro", journal-URL = "http://www.computer.org/csdl/mags/mi/index.html", keywords = "Hot Chips 22 conference proceedings", } @Article{Chen:2011:MJP, author = "Kuo-Yi Chen and J. 
Morris Chang and Ting-Wei Hou", title = "Multithreading in {Java}: Performance and Scalability on Multicore Systems", journal = j-IEEE-TRANS-COMPUT, volume = "60", number = "11", pages = "1521--1534", month = nov, year = "2011", CODEN = "ITCOB4", DOI = "https://doi.org/10.1109/TC.2010.232", ISSN = "0018-9340 (print), 1557-9956 (electronic)", ISSN-L = "0018-9340", bibdate = "Tue Sep 27 07:57:50 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput.bib; https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=5661769", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Computers", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12", } @Article{Chinya:2011:BDP, author = "Gautham N. Chinya and Jamison D. Collins and Perry H. Wang and Hong Jiang and Guei-Yuan Lueh and Thomas A. Piazza and Hong Wang", title = "{Bothnia}: a dual-personality extension to the {Intel} integrated graphics driver", journal = j-OPER-SYS-REV, volume = "45", number = "1", pages = "11--20", month = jan, year = "2011", CODEN = "OSRED8", DOI = "https://doi.org/10.1145/1945023.1945027", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Fri Feb 25 16:43:23 MST 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "In this paper, we introduce Bothnia, an extension to the Intel production graphics driver to support a shared virtual memory heterogeneous multithreading programming model. 
With Bothnia, the Intel graphics device driver can support both the traditional 3D graphics rendering software stack and a new class of heterogeneous multithreaded applications, which can use both IA (Intel Architecture) CPU cores and Intel integrated Graphics and Media Accelerator (GMA) cores in the same virtual address space. We describe the necessary architectural supports in both IA CPU and the GMA cores and present a reference Bothnia implementation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGOPS Operating Systems Review", } @Article{Davis:2011:ASM, author = "Timothy A. Davis", title = "{Algorithm 915}, {SuiteSparseQR}: {Multifrontal} multithreaded rank-revealing sparse {QR} factorization", journal = j-TOMS, volume = "38", number = "1", pages = "8:1--8:22", month = nov, year = "2011", CODEN = "ACMSCU", DOI = "https://doi.org/10.1145/2049662.2049670", ISSN = "0098-3500 (print), 1557-7295 (electronic)", ISSN-L = "0098-3500", bibdate = "Thu Dec 15 08:59:34 MST 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/toms.bib", abstract = "SuiteSparseQR is a sparse QR factorization package based on the multifrontal method. Within each frontal matrix, LAPACK and the multithreaded BLAS enable the method to obtain high performance on multicore architectures. Parallelism across different frontal matrices is handled with Intel's Threading Building Blocks library. The symbolic analysis and ordering phase pre-eliminates singletons by permuting the input matrix $A$ into the form $[R_{11} \; R_{12}; \; 0 \; A_{22}]$ where $R_{11}$ is upper triangular with diagonal entries above a given tolerance. Next, the fill-reducing ordering, column elimination tree, and frontal matrix structures are found without requiring the formation of the pattern of $A^T A$.
Approximate rank-detection is performed within each frontal matrix using Heath's method.", acknowledgement = ack-nhfb, articleno = "8", fjournal = "ACM Transactions on Mathematical Software (TOMS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J782", } @Article{Esparza:2011:CPB, author = "Javier Esparza and Pierre Ganty", title = "Complexity of pattern-based verification for multithreaded programs", journal = j-SIGPLAN, volume = "46", number = "1", pages = "499--510", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926443", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Feinbube:2011:JFM, author = "Frank Feinbube and Peter Troger and Andreas Polze", title = "Joint Forces: From Multithreaded Programming to {GPU} Computing", journal = j-IEEE-SOFTWARE, volume = "28", number = "1", pages = "51--57", month = jan # "\slash " # feb, year = "2011", CODEN = "IESOEG", DOI = "https://doi.org/10.1109/MS.2010.134", ISSN = "0740-7459 (print), 0740-7459 (electronic)", ISSN-L = "0740-7459", bibdate = "Thu Dec 23 16:29:15 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeesoft.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Software", journal-URL = "http://www.computer.org/portal/web/csdl/magazines/software", } @InProceedings{Ganesan:2011:MMP, author = "Karthik Ganesan and Lizy K. 
John", title = "{MAximum Multicore POwer (MAMPO)}: an automatic multithreaded synthetic power virus generation framework for multicore systems", crossref = "Lathrop:2011:SPI", pages = "53:1--53:12", year = "2011", DOI = "https://doi.org/10.1145/2063384.2063455", bibdate = "Fri Dec 16 11:05:47 MST 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/supercomputing2011.bib", acknowledgement = ack-nhfb, articleno = "53", } @Article{Gupta:2011:PAR, author = "Ashutosh Gupta and Corneliu Popeea and Andrey Rybalchenko", title = "Predicate abstraction and refinement for verifying multi-threaded programs", journal = j-SIGPLAN, volume = "46", number = "1", pages = "331--344", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926424", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Hong:2011:AMA, author = "Bo Hong and Zhengyu He", title = "An Asynchronous Multithreaded Algorithm for the Maximum Network Flow Problem with Nonblocking Global Relabeling Heuristic", journal = j-IEEE-TRANS-PAR-DIST-SYS, volume = "22", number = "6", pages = "1025--1033", month = jun, year = "2011", CODEN = "ITDSEO", DOI = "https://doi.org/10.1109/TPDS.2010.156", ISSN = "1045-9219 (print), 1558-2183 (electronic)", ISSN-L = "1045-9219", bibdate = "Fri Jul 22 07:53:43 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Parallel and Distributed Systems", journal-URL = "http://www.computer.org/tpds/archives.htm", } @Article{Hsu:2011:MSS, author = "Chia-Jui Hsu and 
Jos{\'e} Luis Pino and Shuvra S. Bhattacharyya", title = "Multithreaded Simulation for Synchronous Dataflow Graphs", journal = j-TODAES, volume = "16", number = "3", pages = "25:1--25:??", month = jun, year = "2011", CODEN = "ATASFO", DOI = "https://doi.org/10.1145/1970353.1970358", ISSN = "1084-4309 (print), 1557-7309 (electronic)", ISSN-L = "1084-4309", bibdate = "Tue Jun 14 11:55:50 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/todaes/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "For system simulation, Synchronous DataFlow (SDF) has been widely used as a core model of computation in design tools for digital communication and signal processing systems. The traditional approach for simulating SDF graphs is to compute and execute static schedules in single-processor desktop environments. Nowadays, however, multicore processors are increasingly popular desktop platforms for their potential performance improvements through thread-level parallelism. Without novel scheduling and simulation techniques that explicitly explore thread-level parallelism for executing SDF graphs, current design tools gain only minimal performance improvements on multicore platforms. 
In this article, we present a new multithreaded simulation scheduler, called MSS, to provide simulation runtime speedup for executing SDF graphs on multicore processors.", acknowledgement = ack-nhfb, articleno = "25", fjournal = "ACM Transactions on Design Automation of Electronic Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J776", } @Article{Jeffrey:2011:IBM, author = "Dennis Jeffrey and Yan Wang and Chen Tian and Rajiv Gupta", title = "Isolating bugs in multithreaded programs using execution suppression", journal = j-SPE, volume = "41", number = "11", pages = "1259--1288", month = oct, year = "2011", CODEN = "SPEXBL", DOI = "https://doi.org/10.1002/spe.1040", ISSN = "0038-0644 (print), 1097-024X (electronic)", ISSN-L = "0038-0644", bibdate = "Thu Sep 29 14:49:13 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/spe.bib", acknowledgement = ack-nhfb, fjournal = "Software --- Practice and Experience", journal-URL = "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1097-024X", onlinedate = "18 Jan 2011", } @Article{Joisha:2011:TEA, author = "Pramod G. Joisha and Robert S. Schreiber and Prithviraj Banerjee and Hans J. Boehm and Dhruva R. Chakrabarti", title = "A technique for the effective and automatic reuse of classical compiler optimizations on multithreaded code", journal = j-SIGPLAN, volume = "46", number = "1", pages = "623--636", month = jan, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1925844.1926457", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Li:2011:FSM, author = "Guodong Li and Robert Palmer and Michael DeLisi and Ganesh Gopalakrishnan and Robert M. 
Kirby", title = "Formal specification of {MPI 2.0}: {Case} study in specifying a practical concurrent programming {API}", journal = j-SCI-COMPUT-PROGRAM, volume = "76", number = "2", pages = "65--81", day = "1", month = feb, year = "2011", CODEN = "SCPGD4", ISSN = "0167-6423 (print), 1872-7964 (electronic)", ISSN-L = "0167-6423", bibdate = "Fri Apr 1 18:39:40 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/scicomputprogram.bib; http://www.sciencedirect.com/science/journal/01676423", acknowledgement = ack-nhfb, fjournal = "Science of Computer Programming", journal-URL = "http://www.sciencedirect.com/science/journal/01676423/", } @Article{Li:2011:LCM, author = "Sheng Li and Shannon Kuntz and Jay B. Brockman and Peter M. Kogge", title = "{Lightweight Chip Multi-Threading (LCMT)}: Maximizing Fine-Grained Parallelism On-Chip", journal = j-IEEE-TRANS-PAR-DIST-SYS, volume = "22", number = "7", pages = "1178--1191", month = jul, year = "2011", CODEN = "ITDSEO", DOI = "https://doi.org/10.1109/TPDS.2010.169", ISSN = "1045-9219 (print), 1558-2183 (electronic)", ISSN-L = "1045-9219", bibdate = "Fri Jul 22 07:54:38 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Parallel and Distributed Systems", journal-URL = "http://www.computer.org/tpds/archives.htm", } @Article{Liao:2011:AUB, author = "Xiongfei Liao and Thambipillai Srikanthan", title = "Accelerating {UNISIM}-Based Cycle-Level Microarchitectural Simulations on Multicore Platforms", journal = j-TODAES, volume = "16", number = "3", pages = "26:1--26:??", month = jun, year = "2011", CODEN = "ATASFO", DOI = "https://doi.org/10.1145/1970353.1970359", ISSN = "1084-4309 (print), 1557-7309 (electronic)", ISSN-L = "1084-4309", bibdate = "Tue Jun 14 11:55:50 MDT 2011", bibsource = 
"http://www.acm.org/pubs/contents/journals/todaes/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "UNISIM has been shown to ease the development of simulators for multi-/many-core systems. However, UNISIM cycle-level simulations of large-scale multiprocessor systems could be very time consuming. In this article, we propose a systematic framework for accelerating UNISIM cycle-level simulations on multicore platforms. The proposed framework relies on exploiting the fine-grained parallelism within the simulated cycles using POSIX threads. A multithreaded simulation engine has been devised from the single-threaded UNISIM SystemC engine to facilitate the exploitation of inherent parallelism. An adaptive technique that manages the overall computation workload by adjusting the number of threads employed at any given time is proposed. In addition, we have introduced a technique to balance the workloads of multithreaded executions.", acknowledgement = ack-nhfb, articleno = "26", fjournal = "ACM Transactions on Design Automation of Electronic Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J776", } @Article{Ma:2011:SPC, author = "Kai Ma and Xue Li and Ming Chen and Xiaorui Wang", title = "Scalable power control for many-core architectures running multi-threaded applications", journal = j-COMP-ARCH-NEWS, volume = "39", number = "3", pages = "449--460", month = jun, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2024723.2000117", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Sep 5 17:15:11 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Mahafzah:2011:PMI, author = "Basel A. 
Mahafzah", title = "Parallel multithreaded {IDA*} heuristic search: algorithm design and performance evaluation", journal = j-INT-J-PAR-EMER-DIST-SYS, volume = "26", number = "1", pages = "61--82", year = "2011", CODEN = "????", DOI = "https://doi.org/10.1080/17445761003604521", ISSN = "1744-5760 (print), 1744-5779 (electronic)", ISSN-L = "1744-5760", bibdate = "Mon Sep 5 20:33:09 MDT 2011", bibsource = "http://www.informaworld.com/smpp/title~content=t713729127~link=cover; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, journal-URL = "http://www.tandfonline.com/loi/gpaa20", onlinedate = "6 Dec 2010", } @Article{Marino:2011:CSP, author = "Daniel Marino and Abhayendra Singh and Todd Millstein and Madanlal Musuvathi and Satish Narayanasamy", title = "A case for an {SC}-preserving compiler", journal = j-SIGPLAN, volume = "46", number = "6", pages = "199--210", month = jun, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/1993316.1993522", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The most intuitive memory consistency model for shared-memory multi-threaded programming is sequential consistency (SC). However, current concurrent programming languages support a relaxed model, as such relaxations are deemed necessary for enabling important optimizations. This paper demonstrates that an SC-preserving compiler, one that ensures that every SC behavior of a compiler-generated binary is an SC behavior of the source program, retains most of the performance benefits of an optimizing compiler. The key observation is that a large class of optimizations crucial for performance are either already SC-preserving or can be modified to preserve SC while retaining much of their effectiveness. 
An SC-preserving compiler, obtained by restricting the optimization phases in LLVM, a state-of-the-art C/C++ compiler, incurs an average slowdown of 3.8\% and a maximum slowdown of 34\% on a set of 30 programs from the SPLASH-2, PARSEC, and SPEC CINT2006 benchmark suites.\par While the performance overhead of preserving SC in the compiler is much less than previously assumed, it might still be unacceptable for certain applications. We believe there are several avenues for improving performance without giving up SC-preservation. In this vein, we observe that the overhead of our SC-preserving compiler arises mainly from its inability to aggressively perform a class of optimizations we identify as eager-load optimizations. This class includes common-subexpression elimination, constant propagation, global value numbering, and common cases of loop-invariant code motion. We propose a notion of interference checks in order to enable eager-load optimizations while preserving SC. Interference checks expose to the compiler a commonly used hardware speculation mechanism that can efficiently detect whether a particular variable has changed its value since last read.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", keywords = "LLVM compiler suite; sequential consistency (SC)", } @InProceedings{Preissl:2011:MGA, author = "Robert Preissl and Nathan Wichmann and Bill Long and John Shalf and Stephane Ethier and Alice Koniges", title = "Multithreaded Global Address Space Communication Techniques for Gyrokinetic Fusion Applications on Ultra-Scale Platforms", crossref = "Lathrop:2011:SPI", pages = "12:1--12:11", year = "2011", DOI = "https://doi.org/10.1145/2063384.2071033", bibdate = "Fri Dec 16 11:05:47 MST 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/supercomputing2011.bib", acknowledgement = ack-nhfb, } @Article{Prieto:2011:MCM, author 
= "Pablo Prieto and Valentin Puente and Jose-Angel Gregorio", title = "Multilevel Cache Modeling for Chip-Multiprocessor Systems", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "10", number = "2", pages = "49--52", month = jul # "\slash " # dec, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2011.20", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Jun 20 17:18:18 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "This paper presents a simple analytical model for predicting on-chip cache hierarchy effectiveness in chip multiprocessors (CMP) for a state-of-the-art architecture. Given the complexity of this type of systems, we use rough approximations, such as the empirical observation that the re-reference timing pattern follows a power law and the assumption of a simplistic delay model for the cache, in order to provide a useful model for the memory hierarchy responsiveness. This model enables the analytical determination of average access time, which makes design space pruning useful before sweeping the vast design space of this class of systems. The model is also useful for predicting cache hierarchy behavior in future systems. The fidelity of the model has been validated using a state-of-the-art, full-system simulation environment, on a system with up to sixteen out-of-order processors with cache-coherent caches and using a broad spectrum of applications, including complex multithread workloads. This simple model can predict a near-to-optimal, on-chip cache distribution while also estimating how future systems running future applications might behave.", acknowledgement = ack-nhfb, affiliation = "Prieto, P (Reprint Author), Univ Cantabria, Cantabria, Spain. 
Prieto, Pablo; Puente, Valentin; Gregorio, Jose-Angel, Univ Cantabria, Cantabria, Spain.", author-email = "prietop@unican.es vpuente@unican.es monaster@unican.es", da = "2019-06-20", doc-delivery-number = "855NW", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Spanish Ministry of Science and Innovation [TIN2010-18159]; HiPEAC2 European Network of Excellence", funding-text = "This work has been supported by the Spanish Ministry of Science and Innovation, under contracts TIN2010-18159, and by the HiPEAC2 European Network of Excellence. The authors would like to thank the reviewers for their valuable comments.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", number-of-cited-references = "13", ORCID-numbers = "Prieto, Pablo/0000-0002-5818-1188 Puente, Valentin/0000-0002-6904-3282 Gregorio, Jose Angel/0000-0003-2214-303X", research-areas = "Computer Science", times-cited = "3", unique-id = "Prieto:2011:MCM", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Reddy:2011:BFH, author = "Dheeraj Reddy and David Koufaty and Paul Brett and Scott Hahn", title = "Bridging functional heterogeneity in multicore architectures", journal = j-OPER-SYS-REV, volume = "45", number = "1", pages = "21--33", month = jan, year = "2011", CODEN = "OSRED8", DOI = "https://doi.org/10.1145/1945023.1945028", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Fri Feb 25 16:43:23 MST 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Heterogeneous processors that mix big high performance cores with small low power cores promise excellent single-threaded performance coupled with high multi-threaded throughput and higher performance-per-watt. A significant portion of the commercial multicore heterogeneous processors are likely to have a common instruction set architecture (ISA).
However, due to limited design resources and goals, each core is likely to contain ISA extensions not yet implemented in the other core. Therefore, such heterogeneous processors will have inherent functional asymmetry at the ISA level and face significant software challenges. This paper analyzes the software challenges to the operating system and the application layer software on a heterogeneous system with functional asymmetry, where the ISA of the small and big cores overlaps.", acknowledgement = ack-nhfb, fjournal = "ACM SIGOPS Operating Systems Review", } @Article{Roy:2011:SRP, author = "Soumyaroop Roy and Nagarajan Ranganathan and Srinivas Katkoori", title = "State-Retentive Power Gating of Register Files in Multicore Processors Featuring Multithreaded In-Order Cores", journal = j-IEEE-TRANS-COMPUT, volume = "60", number = "11", pages = "1547--1560", month = nov, year = "2011", CODEN = "ITCOB4", DOI = "https://doi.org/10.1109/TC.2010.249", ISSN = "0018-9340 (print), 1557-9956 (electronic)", ISSN-L = "0018-9340", bibdate = "Tue Sep 27 07:57:50 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=5669257", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Computers", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12", } @Article{Schonherr:2011:MTI, author = "M. Sch{\"o}nherr and K. Kucher and M. Geier and M. Stiebler and S. Freudiger and M. 
Krafczyk", title = "Multi-thread implementations of the lattice {Boltzmann} method on non-uniform grids for {CPUs} and {GPUs}", journal = j-COMPUT-MATH-APPL, volume = "61", number = "12", pages = "3730--3743", month = jun, year = "2011", CODEN = "CMAPDK", ISSN = "0898-1221 (print), 1873-7668 (electronic)", ISSN-L = "0898-1221", bibdate = "Wed Mar 1 21:50:48 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/computmathappl2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.sciencedirect.com/science/article/pii/S0898122111002999", acknowledgement = ack-nhfb, fjournal = "Computers and Mathematics with Applications", journal-URL = "http://www.sciencedirect.com/science/journal/08981221", } @Article{Tu:2011:MBM, author = "Xuping Tu and Hai Jin and Zhibin Yu and Jie Chen and Yabin Hu and Xie Xia", title = "{MT-BTRIMER}: A master-slave multi-threaded dynamic binary translator", journal = j-INT-J-COMPUT-SYST-SCI-ENG, volume = "26", number = "5", pages = "??--??", month = sep, year = "2011", CODEN = "CSSEEI", ISSN = "0267-6192", ISSN-L = "0267-6192", bibdate = "Tue Dec 3 12:04:33 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/computsystscieng.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "International Journal of Computer Systems Science and Engineering", } @Article{VanDeGeijn:2011:HPD, author = "Robert A. {Van De Geijn} and Field G. 
{Van Zee}", title = "High-performance up-and-downdating via {Householder}-like transformations", journal = j-TOMS, volume = "38", number = "1", pages = "4:1--4:17", month = nov, year = "2011", CODEN = "ACMSCU", DOI = "https://doi.org/10.1145/2049662.2049666", ISSN = "0098-3500 (print), 1557-7295 (electronic)", ISSN-L = "0098-3500", bibdate = "Thu Dec 15 08:59:34 MST 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/toms.bib", abstract = "We present high-performance algorithms for up-and-downdating a Cholesky factor or QR factorization. The method uses Householder-like transformations, sometimes called hyperbolic Householder transformations, that are accumulated so that most computation can be cast in terms of high-performance matrix-matrix operations. The resulting algorithms can then be used as building blocks for an algorithm-by-blocks that allows computation to be conveniently scheduled to multithreaded architectures like multicore processors. Performance is shown to be similar to that achieved by a blocked QR factorization via Householder transformations.", acknowledgement = ack-nhfb, articleno = "4", fjournal = "ACM Transactions on Mathematical Software (TOMS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J782", } @Article{Vandierendonck:2011:FMM, author = "Hans Vandierendonck and Andre Seznec", title = "Fairness Metrics for Multi-Threaded Processors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "10", number = "1", pages = "4--7", month = jan # "\slash " # jun, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2011.1", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Jun 20 17:18:18 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Multi-threaded processors execute multiple threads concurrently in order to increase overall throughput. 
It is well documented that multi-threading affects per-thread performance but, more importantly, some threads are affected more than others. This is especially troublesome for multi-programmed workloads. Fairness metrics measure whether all threads are affected equally. However defining equal treatment is not straightforward. Several fairness metrics for multi-threaded processors have been utilized in the literature, although there does not seem to be a consensus on what metric does the best job of measuring fairness. This paper reviews the prevalent fairness metrics and analyzes their main properties. Each metric strikes a different trade-off between fairness in the strict sense and throughput. We categorize the metrics with respect to this property. Based on experimental data for SMT processors, we suggest using the minimum fairness metric in order to balance fairness and throughput.", acknowledgement = ack-nhfb, affiliation = "Vandierendonck, H (Reprint Author), Univ Ghent, Dept Elect \& Informat Syst, Ghent, Belgium. Vandierendonck, Hans, Univ Ghent, Dept Elect \& Informat Syst, Ghent, Belgium. Seznec, Andre, INRIA Rennes, Rennes, France.", author-email = "hans.vandierendonck@elis.ugent.be Andre.Seznec@inria.fr", da = "2019-06-20", doc-delivery-number = "773ZN", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "fairness; measurement; multi-programming; Multi-threaded processors; quality-of-service", number-of-cited-references = "11", research-areas = "Computer Science", times-cited = "13", unique-id = "Vandierendonck:2011:FMM", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Vandierendonck:2011:MSR, author = "Hans Vandierendonck and Andr{\'e} Seznec", title = "Managing {SMT} resource usage through speculative instruction window weighting", journal = j-TACO, volume = "8", number = "3", pages = "12:1--12:??", month = oct, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2019608.2019611", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Oct 22 09:15:12 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Simultaneous multithreading processors dynamically share processor resources between multiple threads. In general, shared SMT resources may be managed explicitly, for instance, by dynamically setting queue occupation bounds for each thread as in the DCRA and Hill-Climbing policies. Alternatively, resources may be managed implicitly; that is, resource usage is controlled by placing the desired instruction mix in the resources. In this case, the main resource management tool is the instruction fetch policy which must predict the behavior of each thread (branch mispredictions, long-latency loads, etc.) as it fetches instructions. 
In this article, we present the use of Speculative Instruction Window Weighting (SIWW) to bridge the gap between implicit and explicit SMT fetch policies.", acknowledgement = ack-nhfb, articleno = "12", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", } @Article{Yu:2011:SDH, author = "Wing-kei S. Yu and Ruirui Huang and Sarah Q. Xu and Sung-En Wang and Edwin Kan and G. Edward Suh", title = "{SRAM--DRAM} hybrid memory with applications to efficient register files in fine-grained multi-threading", journal = j-COMP-ARCH-NEWS, volume = "39", number = "3", pages = "247--258", month = jun, year = "2011", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2024723.2000094", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Sep 5 17:15:11 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Zhao:2011:DCC, author = "Qin Zhao and David Koh and Syed Raza and Derek Bruening and Weng-Fai Wong and Saman Amarasinghe", title = "Dynamic cache contention detection in multi-threaded applications", journal = j-SIGPLAN, volume = "46", number = "7", pages = "27--38", month = jul, year = "2011", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2007477.1952688", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 16 10:02:34 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Zhu:2011:TPS, author = "David (Yu) Zhu and Jaeyeon Jung and Dawn Song and Tadayoshi 
Kohno and David Wetherall", title = "{TaintEraser}: protecting sensitive data leaks using application-level taint tracking", journal = j-OPER-SYS-REV, volume = "45", number = "1", pages = "142--154", month = jan, year = "2011", CODEN = "OSRED8", DOI = "https://doi.org/10.1145/1945023.1945039", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Fri Feb 25 16:43:23 MST 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "We present TaintEraser, a new tool that tracks the movement of sensitive user data as it flows through off-the-shelf applications. TaintEraser uses application-level dynamic taint analysis to let users run applications in their own environment while preventing unwanted information exposure. It is made possible by techniques we developed for accurate and efficient tainting: (1) Semantic-aware instruction-level tainting is critical to track taint accurately, without explosion or loss. (2) Function summaries provide an interface to handle taint propagation within the kernel and reduce the overhead of instruction-level tracking. (3) On-demand instrumentation enables fast loading of large applications. 
Together, these techniques let us analyze large, multi-threaded, networked applications in near real-time.", acknowledgement = ack-nhfb, fjournal = "ACM SIGOPS Operating Systems Review", } @Article{Zhuang:2011:CST, author = "Xiaotong Zhuang and Santosh Pande", title = "Compiler-Supported Thread Management for Multithreaded Network Processors", journal = j-TECS, volume = "10", number = "4", pages = "44:1--44:??", month = nov, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2043662.2043668", ISSN = "1539-9087 (print), 1558-3465 (electronic)", ISSN-L = "1539-9087", bibdate = "Mon Dec 19 15:49:06 MST 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/tecs.bib", abstract = "Traditionally, runtime management involving CPU sharing, real-time scheduling, etc., is provided by the runtime environment (typically an operating system) using hardware support such as timers and interrupts. However, due to stringent performance requirements on network processors, neither OS nor hardware mechanisms are typically feasible/available. Mapping packet processing tasks on network processors involves complex trade-offs to maximize parallelism and pipelining. Due to an increase in the size of the code store and complexity of application requirements, network processors are being programmed with heterogeneous threads that may execute code belonging to different tasks on a given micro-engine. Also, most network applications are streaming applications that are typically processed in a pipelined fashion.", acknowledgement = ack-nhfb, articleno = "44", fjournal = "ACM Transactions on Embedded Computing Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J840", } @Article{Ahn:2012:ISE, author = "Jung Ho Ahn and Norman P. Jouppi and Christos Kozyrakis and Jacob Leverich and Robert S. 
Schreiber", title = "Improving System Energy Efficiency with Memory Rank Subsetting", journal = j-TACO, volume = "9", number = "1", pages = "4:1--4:??", month = mar, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2133382.2133386", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 30 17:45:35 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "VLSI process technology scaling has enabled dramatic improvements in the capacity and peak bandwidth of DRAM devices. However, current standard DDR x DIMM memory interfaces are not well tailored to achieve high energy efficiency and performance in modern chip-multiprocessor-based computer systems. Their suboptimal performance and energy inefficiency can have a significant impact on system-wide efficiency since much of the system power dissipation is due to memory power. New memory interfaces, better suited for future many-core systems, are needed. In response, there are recent proposals to enhance the energy efficiency of main-memory systems by dividing a memory rank into subsets, and making a subset rather than a whole rank serve a memory request. We holistically assess the effectiveness of rank subsetting from system-wide performance, energy-efficiency, and reliability perspectives. We identify the impact of rank subsetting on memory power and processor performance analytically, compare two promising rank-subsetting proposals, Multicore DIMM and mini-rank, and verify our analysis by simulating a chip-multiprocessor system using multithreaded and consolidated workloads. We extend the design of Multicore DIMM for high-reliability systems and show that compared with conventional chipkill approaches, rank subsetting can lead to much higher system-level energy efficiency and performance at the cost of additional DRAM devices. 
This holistic assessment shows that rank subsetting offers compelling alternatives to existing processor-memory interfaces for future DDR systems.", acknowledgement = ack-nhfb, articleno = "4", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", } @Article{Aliaga:2012:SDG, author = "Jos{\'e} I. Aliaga and Paolo Bientinesi and Davor Davidovi{\'c} and Edoardo {Di Napoli} and Francisco D. Igual and Enrique S. Quintana-Ort{\'\i}", title = "Solving dense generalized eigenproblems on multi-threaded architectures", journal = j-APPL-MATH-COMP, volume = "218", number = "22", pages = "11279--11289", day = "15", month = jul, year = "2012", CODEN = "AMHCBQ", DOI = "https://doi.org/10.1016/j.amc.2012.05.020", ISSN = "0096-3003 (print), 1873-5649 (electronic)", ISSN-L = "0096-3003", bibdate = "Mon Jun 25 12:18:46 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/applmathcomput2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sciencedirect.com/science/journal/00963003", URL = "http://www.sciencedirect.com/science/article/pii/S009630031200505X", acknowledgement = ack-nhfb, fjournal = "Applied Mathematics and Computation", journal-URL = "http://www.sciencedirect.com/science/journal/00963003", } @Article{Arnau:2012:BMG, author = "Jos{\'e}-Mar{\'\i}a Arnau and Joan-Manuel Parcerisa and Polychronis Xekalakis", title = "Boosting mobile {GPU} performance with a decoupled access\slash execute fragment processor", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "84--93", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337169", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", 
abstract = "Smartphones represent one of the fastest growing markets, providing significant hardware/software improvements every few months. However, supporting these capabilities reduces the operating time per battery charge. The CPU/GPU component is only left with a shrinking fraction of the power budget, since most of the energy is consumed by the screen and the antenna. In this paper, we focus on improving the energy efficiency of the GPU since graphical applications consist an important part of the existing market. Moreover, the trend towards better screens will inevitably lead to a higher demand for improved graphics rendering. We show that the main bottleneck for these applications is the texture cache and that traditional techniques for hiding memory latency (prefetching, multithreading) do not work well or come at a high energy cost. We thus propose the migration of GPU designs towards the decoupled access-execute concept. Furthermore, we significantly reduce bandwidth usage in the decoupled architecture by exploiting inter-core data sharing. Using commercial Android applications, we show that the end design can achieve 93\% of the performance of a heavily multithreaded GPU while providing energy savings of 34\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Baghsorkhi:2012:EPE, author = "Sara S. Baghsorkhi and Isaac Gelado and Matthieu Delahaye and Wen-mei W. 
Hwu", title = "Efficient performance evaluation of memory hierarchy for highly multithreaded graphics processors", journal = j-SIGPLAN, volume = "47", number = "8", pages = "23--34", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145820", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "PPOPP '12 conference proceedings.", abstract = "With the emergence of highly multithreaded architectures, performance monitoring techniques face new challenges in efficiently locating sources of performance discrepancies in the program source code. For example, the state-of-the-art performance counters in highly multithreaded graphics processing units (GPUs) report only the overall occurrences of microarchitecture events at the end of program execution. Furthermore, even if supported, any fine-grained sampling of performance counters will distort the actual program behavior and will make the sampled values inaccurate. On the other hand, it is difficult to achieve high resolution performance information at low sampling rates in the presence of thousands of concurrently running threads. In this paper, we present a novel software-based approach for monitoring the memory hierarchy performance in highly multithreaded general-purpose graphics processors. The proposed analysis is based on memory traces collected for snapshots of an application execution. A trace-based memory hierarchy model with a Monte Carlo experimental methodology generates statistical bounds of performance measures without being concerned about the exact inter-thread ordering of individual events but rather studying the behavior of the overall system. The statistical approach overcomes the classical problem of disturbed execution timing due to fine-grained instrumentation. 
The approach scales well as we deploy an efficient parallel trace collection technique to reduce the trace generation overhead and a simple memory hierarchy model to reduce the simulation time. The proposed scheme also keeps track of individual memory operations in the source code and can quantify their efficiency with respect to the memory system. A cross-validation of our results shows close agreement with the values read from the hardware performance counters on an NVIDIA Tesla C2050 GPU. Based on the high resolution profile data produced by our model we optimized memory accesses in the sparse matrix vector multiply kernel and achieved speedups ranging from 2.4 to 14.8 depending on the characteristics of the input matrices.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Bouajjani:2012:ARP, author = "Ahmed Bouajjani and Michael Emmi", title = "Analysis of recursively parallel programs", journal = j-SIGPLAN, volume = "47", number = "1", pages = "203--214", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103681", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "We propose a general formal model of isolated hierarchical parallel computations, and identify several fragments to match the concurrency constructs present in real-world programming languages such as Cilk and X10. By associating fundamental formal models (vector addition systems with recursive transitions) to each fragment, we provide a common platform for exposing the relative difficulties of algorithmic reasoning. For each case we measure the complexity of deciding state-reachability for finite-data recursive programs, and propose algorithms for the decidable cases. 
The complexities which include PTIME, NP, EXPSPACE, and 2EXPTIME contrast with undecidable state-reachability for recursive multi-threaded programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "POPL '12 conference proceedings.", } @Article{Burgess:2012:EFL, author = "David Burgess and Edmund Gieske and James Holt and Thomas Hoy and Gary Whisenhunt", title = "{e6500}: {Freescale}'s Low-Power, High-Performance Multithreaded Embedded Processor", journal = j-IEEE-MICRO, volume = "32", number = "5", pages = "26--36", month = sep # "\slash " # oct, year = "2012", CODEN = "IEMIDZ", DOI = "https://doi.org/10.1109/MM.2012.55", ISSN = "0272-1732 (print), 1937-4143 (electronic)", ISSN-L = "0272-1732", bibdate = "Thu Nov 15 05:59:33 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeemicro.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Micro", journal-URL = "http://www.computer.org/csdl/mags/mi/index.html", } @Article{Burnim:2012:SCS, author = "Jacob Burnim and George Necula and Koushik Sen", title = "Specifying and checking semantic atomicity for multithreaded programs", journal = j-SIGPLAN, volume = "47", number = "4", pages = "79--90", month = apr, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2248487.1950377", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:03 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In practice, it is quite difficult to write correct multithreaded programs due to the potential for unintended and nondeterministic interference between parallel threads. 
A fundamental correctness property for such programs is atomicity---a block of code in a program is atomic if, for any parallel execution of the program, there is an execution with the same overall program behavior in which the block is executed serially. We propose semantic atomicity, a generalization of atomicity with respect to a programmer-defined notion of equivalent behavior. We propose an assertion framework in which a programmer can use bridge predicates to specify noninterference properties at the level of abstraction of their application. Further, we propose a novel algorithm for systematically testing atomicity specifications on parallel executions with a bounded number of interruptions---i.e. atomic blocks whose execution is interleaved with that of other threads. We further propose a set of sound heuristics and optional user annotations that increase the efficiency of checking atomicity specifications in the common case where the specifications hold. We have implemented our assertion framework for specifying and checking semantic atomicity for parallel Java programs, and we have written semantic atomicity specifications for a number of benchmarks. We found that using bridge predicates allowed us to specify the natural and intended atomic behavior of a wider range of programs than did previous approaches. Further, in checking our specifications, we found several previously unknown bugs, including in the widely-used java.util.concurrent library.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "ASPLOS '12 conference proceedings.", } @Article{Catalyurek:2012:GCA, author = "{\"U}mit V. {\c{C}}ataly{\"u}rek and John Feo and Assefaw H. 
Gebremedhin and Mahantesh Halappanavar and Alex Pothen", title = "Graph coloring algorithms for multi-core and massively multithreaded architectures", journal = j-PARALLEL-COMPUTING, volume = "38", number = "10--11", pages = "576--594", month = oct # "\slash " # nov, year = "2012", CODEN = "PACOEJ", DOI = "https://doi.org/10.1016/j.parco.2012.07.001", ISSN = "0167-8191 (print), 1872-7336 (electronic)", ISSN-L = "0167-8191", bibdate = "Thu Oct 25 09:00:31 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib; http://www.sciencedirect.com/science/journal/01678191", URL = "http://www.sciencedirect.com/science/article/pii/S0167819112000592", acknowledgement = ack-nhfb, fjournal = "Parallel Computing", journal-URL = "http://www.sciencedirect.com/science/journal/01678191", } @InProceedings{Chen:2012:CLA, author = "Guancheng Chen and Per Stenstrom", title = "Critical lock analysis: diagnosing critical section bottlenecks in multithreaded applications", crossref = "Hollingsworth:2012:SPI", pages = "71:1--71:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a099.pdf", abstract = "Critical sections are well known potential performance bottlenecks in multithreaded applications and identifying the ones that inhibit scalability are important for performance optimizations. While previous approaches use idle time as a key measure, we show such a measure is not reliable. The reason is that idleness does not necessarily mean the critical section is on the critical path. We introduce critical lock analysis, a new method for diagnosing critical section bottlenecks in multithreaded applications. 
Our method firstly identifies the critical sections appearing on the critical path, and then quantifies the impact of such critical sections on the overall performance by using quantitative performance metrics. Case studies show that our method can successfully identify critical sections that are most beneficial for improving overall performance as well as quantify their performance impact on the critical path, which results in a more reliable establishment of the inherent critical section bottlenecks than previous approaches.", acknowledgement = ack-nhfb, articleno = "71", } @Article{Chen:2012:MLS, author = "Chih-Yuan Chen and Jhong-Yi Ciou and Rong-Guey Chang", title = "Multi-level simultaneous multithreading scheduling to reduce the temperature of register files", journal = j-CCPE, volume = "24", number = "12", pages = "1296--1316", day = "25", month = aug, year = "2012", CODEN = "CCPEBO", DOI = "https://doi.org/10.1002/cpe.1831", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Mon Nov 5 07:44:51 MST 2012", bibsource = "http://www.interscience.wiley.com/jpages/1532-0626; https://www.math.utah.edu/pub/tex/bib/ccpe.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Concurrency and Computation: Practice and Experience", journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626", onlinedate = "22 Sep 2011", } @Article{Clements:2012:SAS, author = "Austin T. Clements and M. 
Frans Kaashoek and Nickolai Zeldovich", title = "Scalable address spaces using {RCU} balanced trees", journal = j-COMP-ARCH-NEWS, volume = "40", number = "1", pages = "199--210", month = mar, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2189750.2150998", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Jun 1 17:06:46 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ASPLOS '12 conference proceedings.", abstract = "Software developers commonly exploit multicore processors by building multithreaded software in which all threads of an application share a single address space. This shared address space has a cost: kernel virtual memory operations such as handling soft page faults, growing the address space, mapping files, etc. can limit the scalability of these applications. In widely-used operating systems, all of these operations are synchronized by a single per-process lock. This paper contributes a new design for increasing the concurrency of kernel operations on a shared address space by exploiting read-copy-update (RCU) so that soft page faults can both run in parallel with operations that mutate the same address space and avoid contending with other page faults on shared cache lines. To enable such parallelism, this paper also introduces an RCU-based binary balanced tree for storing memory mappings. An experimental evaluation using three multithreaded applications shows performance improvements on 80 cores ranging from 1.7x to 3.4x for an implementation of this design in the Linux 2.6.37 kernel. 
The RCU-based binary tree enables soft page faults to run at a constant cost with an increasing number of cores, suggesting that the design will scale well beyond 80 cores.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Devietti:2012:RAS, author = "Joseph Devietti and Benjamin P. Wood and Karin Strauss and Luis Ceze and Dan Grossman and Shaz Qadeer", title = "{RADISH}: always-on sound and complete {{\underline{Ra}ce \underline{D}etection \underline{i}n \underline{S}oftware and \underline{H}ardware}}", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "201--212", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337182", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "Data-race freedom is a valuable safety property for multithreaded programs that helps with catching bugs, simplifying memory consistency model semantics, and verifying and enforcing both atomicity and determinism. Unfortunately, existing software-only dynamic race detectors are precise but slow; proposals with hardware support offer higher performance but are imprecise. Both precision and performance are necessary to achieve the many advantages always-on dynamic race detection could provide. To resolve this trade-off, we propose Radish, a hybrid hardware-software dynamic race detector that is always-on and fully precise. In Radish, hardware caches a principled subset of the metadata necessary for race detection; this subset allows the vast majority of race checks to occur completely in hardware. 
A flexible software layer handles persistence of race detection metadata on cache evictions and occasional queries to this expanded set of metadata. We show that Radish is correct by proving equivalence to a conventional happens-before race detector. Our design has modest hardware complexity: caches are completely unmodified and we piggy-back on existing coherence messages but do not otherwise modify the protocol. Furthermore, Radish can leverage type-safe languages to reduce overheads substantially. Our evaluation of a simulated 8-core Radish processor using PARSEC benchmarks shows runtime overheads from negligible to 2x, outperforming the leading software-only race detector by 2x-37x.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Devietti:2012:RRC, author = "Joseph Devietti and Jacob Nelson and Tom Bergan and Luis Ceze and Dan Grossman", title = "{RCDC}: a relaxed consistency deterministic computer", journal = j-SIGPLAN, volume = "47", number = "4", pages = "67--78", month = apr, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2248487.1950376", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:03 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Providing deterministic execution significantly simplifies the debugging, testing, replication, and deployment of multithreaded programs. Recent work has developed deterministic multiprocessor architectures as well as compiler and runtime systems that enforce determinism in current hardware. Such work has incidentally imposed strong memory-ordering properties. 
Historically, memory ordering has been relaxed in favor of higher performance in shared memory multiprocessors and, interestingly, determinism exacerbates the cost of strong memory ordering. Consequently, we argue that relaxed memory ordering is vital to achieving faster deterministic execution. This paper introduces RCDC, a deterministic multiprocessor architecture that takes advantage of relaxed memory orderings to provide high-performance deterministic execution with low hardware complexity. RCDC has two key innovations: a hybrid HW/SW approach to enforcing determinism; and a new deterministic execution strategy that leverages data-race-free-based memory models (e.g., the models for Java and C++) to improve performance and scalability without sacrificing determinism, even in the presence of races. In our hybrid HW/SW approach, the only hardware mechanisms required are software-controlled store buffering and support for precise instruction counting; we do not require speculation. A runtime system uses these mechanisms to enforce determinism for arbitrary programs. We evaluate RCDC using PARSEC benchmarks and show that relaxing memory ordering leads to performance and scalability close to nondeterministic execution without requiring any form of speculation. We also compare our new execution strategy to one based on TSO (total-store-ordering) and show that some applications benefit significantly from the extra relaxation. 
We also evaluate a software-only implementation of our new deterministic execution strategy.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "ASPLOS '12 conference proceedings.", } @InProceedings{Ding:2012:CDF, author = "Wei Ding and Yuanrui Zhang and Mahmut Kandemir and Seung Woo Son", title = "Compiler-directed file layout optimization for hierarchical storage systems", crossref = "Hollingsworth:2012:SPI", pages = "41:1--41:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a030.pdf", abstract = "File layout of array data is a critical factor that effects the behavior of storage caches, and has so far taken not much attention in the context of hierarchical storage systems. The main contribution of this paper is a compiler-driven file layout optimization scheme for hierarchical storage caches. This approach, fully automated within an optimizing compiler, analyzes a multi-threaded application code and determines a file layout for each disk-resident array referenced by the code, such that the performance of the target storage cache hierarchy is maximized. We tested our approach using 16 I/O intensive application programs and compared its performance against two previously proposed approaches under different cache space management schemes. 
Our experimental results show that the proposed approach improves the execution time of these parallel applications by 23.7\% on average.", acknowledgement = ack-nhfb, articleno = "41", } @Article{Dolby:2012:DCA, author = "Julian Dolby and Christian Hammer and Daniel Marino and Frank Tip and Mandana Vaziri and Jan Vitek", title = "A data-centric approach to synchronization", journal = j-TOPLAS, volume = "34", number = "1", pages = "4:1--4:48", month = apr, year = "2012", CODEN = "ATPSDT", DOI = "https://doi.org/10.1145/2160910.2160913", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Mon Apr 30 17:20:50 MDT 2012", bibsource = "http://www.acm.org/pubs/contents/journals/toplas/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/toplas.bib", abstract = "Concurrency-related errors, such as data races, are frustratingly difficult to track down and eliminate in large object-oriented programs. Traditional approaches to preventing data races rely on protecting instruction sequences with synchronization operations. Such control-centric approaches are inherently brittle, as the burden is on the programmer to ensure that all concurrently accessed memory locations are consistently protected. Data-centric synchronization is an alternative approach that offloads some of the work on the language implementation. Data-centric synchronization groups fields of objects into atomic sets to indicate that these fields must always be updated atomically. Each atomic set has associated units of work, that is, code fragments that preserve the consistency of that atomic set. Synchronization operations are added automatically by the compiler. We present an extension to the Java programming language that integrates annotations for data-centric concurrency control. 
The resulting language, called AJ, relies on a type system that enables separate compilation and supports atomic sets that span multiple objects and that also supports full encapsulation for more efficient code generation. We evaluate our proposal by refactoring classes from standard libraries, as well as a number of multithreaded benchmarks, to use atomic sets. Our results suggest that data-centric synchronization is easy to use and enjoys low annotation overhead, while successfully preventing data races. Moreover, experiments on the SPECjbb benchmark suggest that acceptable performance can be achieved with a modest amount of tuning.", acknowledgement = ack-nhfb, articleno = "4", fjournal = "ACM Transactions on Programming Languages and Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783", } @Article{Esmaeilzadeh:2012:LBL, author = "Hadi Esmaeilzadeh and Ting Cao and Xi Yang and Stephen M. Blackburn and Kathryn S. McKinley", title = "Looking back on the language and hardware revolutions: measured power, performance, and scaling", journal = j-SIGPLAN, volume = "47", number = "4", pages = "319--332", month = apr, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2248487.1950402", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Jun 7 08:15:03 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper reports and analyzes measured chip power and performance on five process technology generations executing 61 diverse benchmarks with a rigorous methodology. We measure representative Intel IA32 processors with technologies ranging from 130nm to 32nm while they execute sequential and parallel benchmarks written in native and managed languages. 
During this period, hardware and software changed substantially: (1) hardware vendors delivered chip multiprocessors instead of uniprocessors, and independently (2) software developers increasingly chose managed languages instead of native languages. This quantitative data reveals the extent of some known and previously unobserved hardware and software trends. Two themes emerge. (I) Workload: The power, performance, and energy trends of native workloads do not approximate managed workloads. For example, (a) the SPEC CPU2006 native benchmarks on the i7 (45) and i5 (32) draw significantly less power than managed or scalable native benchmarks; and (b) managed runtimes exploit parallelism even when running single-threaded applications. The results recommend architects always include native and managed workloads when designing and evaluating energy efficient hardware. (II) Architecture: Clock scaling, microarchitecture, simultaneous multithreading, and chip multiprocessors each elicit a huge variety of power, performance, and energy responses. This variety and the difficulty of obtaining power measurements recommends exposing on-chip power meters and when possible structure specific power meters for cores, caches, and other structures. 
Just as hardware event counters provide a quantitative grounding for performance innovations, power meters are necessary for optimizing energy.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "ASPLOS '12 conference proceedings.", } @Article{Eyerman:2012:PMJ, author = "Stijn Eyerman and Lieven Eeckhout", title = "Probabilistic modeling for job symbiosis scheduling on {SMT} processors", journal = j-TACO, volume = "9", number = "2", pages = "7:1--7:??", month = jun, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2207222.2207223", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Jun 13 17:20:51 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Symbiotic job scheduling improves simultaneous multithreading (SMT) processor performance by coscheduling jobs that have ``compatible'' demands on the processor's shared resources. Existing approaches however require a sampling phase, evaluate a limited number of possible coschedules, use heuristics to gauge symbiosis, are rigid in their optimization target, and do not preserve system-level priorities/shares. This article proposes probabilistic job symbiosis modeling, which predicts whether jobs will create positive or negative symbiosis when coscheduled without requiring the coschedule to be evaluated. The model, which uses per-thread cycle stacks computed through a previously proposed cycle accounting architecture, is simple enough to be used in system software. 
Probabilistic job symbiosis modeling provides six key innovations over prior work in symbiotic job scheduling: (i) it does not require a sampling phase, (ii) it readjusts the job coschedule continuously, (iii) it evaluates a large number of possible coschedules at very low overhead, (iv) it is not driven by heuristics, (v) it can optimize a performance target of interest (e.g., system throughput or job turnaround time), and (vi) it preserves system-level priorities/shares. These innovations make symbiotic job scheduling both practical and effective. Our experimental evaluation, which assumes a realistic scenario in which jobs come and go, reports an average 16\% (and up to 35\%) reduction in job turnaround time compared to the previously proposed SOS (sample, optimize, symbios) approach for a two-thread SMT processor, and an average 19\% (and up to 45\%) reduction in job turnaround time for a four-thread SMT processor.", acknowledgement = ack-nhfb, articleno = "7", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", } @Article{Farzan:2012:VPC, author = "Azadeh Farzan and Zachary Kincaid", title = "Verification of parameterized concurrent programs by modular reasoning about data and control", journal = j-SIGPLAN, volume = "47", number = "1", pages = "297--308", month = jan, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2103621.2103693", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "In this paper, we consider the problem of verifying thread-state properties of multithreaded programs in which the number of active threads cannot be statically bounded. Our approach is based on decomposing the task into two modules, where one reasons about data and the other reasons about control. 
The data module computes thread-state invariants (e.g., linear constraints over global variables and local variables of one thread) using the thread interference information computed by the control module. The control module computes a representation of thread interference, as an incrementally constructed data flow graph, using the data invariants provided by the data module. These invariants are used to rule out patterns of thread interference that can not occur in a real program execution. The two modules are incorporated into a feedback loop, so that the abstractions of data and interference are iteratively coarsened as the algorithm progresses (that is, they become weaker) until a fixed point is reached. Our approach is sound and terminating, and applicable to programs with infinite state (e.g., unbounded integers) and unboundedly many threads. The verification method presented in this paper has been implemented into a tool, called Duet. We demonstrate the effectiveness of our technique by verifying properties of a selection of Linux device drivers using Duet, and also compare Duet with previous work on verification of parameterized Boolean program using the Boolean abstractions of these drivers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "POPL '12 conference proceedings.", } @Article{Foltzer:2012:MSP, author = "Adam Foltzer and Abhishek Kulkarni and Rebecca Swords and Sajith Sasidharan and Eric Jiang and Ryan Newton", title = "A meta-scheduler for the par-monad: composable scheduling for the heterogeneous cloud", journal = j-SIGPLAN, volume = "47", number = "9", pages = "235--246", month = sep, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398856.2364562", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:19 MST 2012", bibsource = 
"https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Modern parallel computing hardware demands increasingly specialized attention to the details of scheduling and load balancing across heterogeneous execution resources that may include GPU and cloud environments, in addition to traditional CPUs. Many existing solutions address the challenges of particular resources, but do so in isolation, and in general do not compose within larger systems. We propose a general, composable abstraction for execution resources, along with a continuation-based meta-scheduler that harnesses those resources in the context of a deterministic parallel programming library for Haskell. We demonstrate performance benefits of combined CPU/GPU scheduling over either alone, and of combined multithreaded/distributed scheduling over existing distributed programming approaches for Haskell.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "ICFP '12 conference proceedings.", } @InProceedings{Garland:2012:DUP, author = "Michael Garland and Manjunath Kudlur and Yili Zheng", title = "Designing a unified programming model for heterogeneous machines", crossref = "Hollingsworth:2012:SPI", pages = "67:1--67:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a064.pdf", abstract = "While high-efficiency machines are increasingly embracing heterogeneous architectures and massive multithreading, contemporary mainstream programming languages reflect a mental model in which processing elements are homogeneous, concurrency is limited, and memory is a flat undifferentiated pool of storage. 
Moreover, the current state of the art in programming heterogeneous machines tends towards using separate programming models, such as OpenMP and CUDA, for different portions of the machine. Both of these factors make programming emerging heterogeneous machines unnecessarily difficult. We describe the design of the Phalanx programming model, which seeks to provide a unified programming model for heterogeneous machines. It provides constructs for bulk parallelism, synchronization, and data placement which operate across the entire machine. Our prototype implementation is able to launch and coordinate work on both CPU and GPU processors within a single node, and by leveraging the GASNet runtime, is able to run across all the nodes of a distributed-memory machine.", acknowledgement = ack-nhfb, articleno = "67", } @Article{Gebhart:2012:HTS, author = "Mark Gebhart and Daniel R. Johnson and David Tarjan and Stephen W. Keckler and William J. Dally and Erik Lindholm and Kevin Skadron", title = "A Hierarchical Thread Scheduler and Register File for Energy-Efficient Throughput Processors", journal = j-TOCS, volume = "30", number = "2", pages = "8:1--8:??", month = apr, year = "2012", CODEN = "ACSYEC", DOI = "https://doi.org/10.1145/2166879.2166882", ISSN = "0734-2071 (print), 1557-7333 (electronic)", ISSN-L = "0734-2071", bibdate = "Fri Apr 27 12:10:22 MDT 2012", bibsource = "http://www.acm.org/pubs/contents/journals/tocs/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/tocs.bib", abstract = "Modern graphics processing units (GPUs) employ a large number of hardware threads to hide both function unit and memory access latency. Extreme multithreading requires a complex thread scheduler as well as a large register file, which is expensive to access both in terms of energy and latency. We present two complementary techniques for reducing energy on massively-threaded processors such as GPUs. 
First, we investigate a two-level thread scheduler that maintains a small set of active threads to hide ALU and local memory access latency and a larger set of pending threads to hide main memory latency. Reducing the number of threads that the scheduler must consider each cycle improves the scheduler's energy efficiency. Second, we propose replacing the monolithic register file found on modern designs with a hierarchical register file. We explore various trade-offs for the hierarchy including the number of levels in the hierarchy and the number of entries at each level. We consider both a hardware-managed caching scheme and a software-managed scheme, where the compiler is responsible for orchestrating all data movement within the register file hierarchy. Combined with a hierarchical register file, our two-level thread scheduler provides a further reduction in energy by only allocating entries in the upper levels of the register file hierarchy for active threads. Averaging across a variety of real world graphics and compute workloads, the active thread count can be reduced by a factor of 4 with minimal impact on performance and our most efficient three-level software-managed register file hierarchy reduces register file energy by 54\%.", acknowledgement = ack-nhfb, articleno = "8", fjournal = "ACM Transactions on Computer Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J774", } @Article{Grebenshchikov:2012:SSV, author = "Sergey Grebenshchikov and Nuno P. 
Lopes and Corneliu Popeea and Andrey Rybalchenko", title = "Synthesizing software verifiers from proof rules", journal = j-SIGPLAN, volume = "47", number = "6", pages = "405--416", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254112", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "PLDI '12 proceedings.", abstract = "Automatically generated tools can significantly improve programmer productivity. For example, parsers and dataflow analyzers can be automatically generated from declarative specifications in the form of grammars, which tremendously simplifies the task of implementing a compiler. In this paper, we present a method for the automatic synthesis of software verification tools. Our synthesis procedure takes as input a description of the employed proof rule, e.g., program safety checking via inductive invariants, and produces a tool that automatically discovers the auxiliary assertions required by the proof rule, e.g., inductive loop invariants and procedure summaries. We rely on a (standard) representation of proof rules using recursive equations over the auxiliary assertions. The discovery of auxiliary assertions, i.e., solving the equations, is based on an iterative process that extrapolates solutions obtained for finitary unrollings of equations. We show how our method synthesizes automatic safety and liveness verifiers for programs with procedures, multi-threaded programs, and functional programs. 
Our experimental comparison of the resulting verifiers with existing state-of-the-art verification tools confirms the practicality of the approach.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Halappanavar:2012:AWM, author = "Mahantesh Halappanavar and John Feo and Oreste Villa and Antonino Tumeo and Alex Pothen", title = "Approximate weighted matching on emerging manycore and multithreaded architectures", journal = j-IJHPCA, volume = "26", number = "4", pages = "413--430", month = nov, year = "2012", CODEN = "IHPCFL", DOI = "https://doi.org/10.1177/1094342012452893", ISSN = "1094-3420 (print), 1741-2846 (electronic)", ISSN-L = "1094-3420", bibdate = "Thu Nov 8 11:31:16 MST 2012", bibsource = "http://hpc.sagepub.com/content/26/4.toc; https://www.math.utah.edu/pub/tex/bib/ijsa.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://hpc.sagepub.com/content/26/4/413.full.pdf+html", acknowledgement = ack-nhfb, fjournal = "International Journal of High Performance Computing Applications", journal-URL = "http://hpc.sagepub.com/content/by/year", onlinedate = "August 9, 2012", } @Article{Hayden:2012:KEG, author = "Christopher M. Hayden and Edward K. Smith and Michail Denchev and Michael Hicks and Jeffrey S. 
Foster", title = "{Kitsune}: efficient, general-purpose dynamic software updating for {C}", journal = j-SIGPLAN, volume = "47", number = "10", pages = "249--264", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384635", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Dynamic software updating (DSU) systems allow programs to be updated while running, thereby permitting developers to add features and fix bugs without downtime. This paper introduces Kitsune, a new DSU system for C whose design has three notable features. First, Kitsune's updating mechanism updates the whole program, not individual functions. This mechanism is more flexible than most prior approaches and places no restrictions on data representations or allowed compiler optimizations. Second, Kitsune makes the important aspects of updating explicit in the program text, making the program's semantics easy to understand while minimizing programmer effort. Finally, the programmer can write simple specifications to direct Kitsune to generate code that traverses and transforms old-version state for use by new code; such state transformation is often necessary, and is significantly more difficult in prior DSU systems. 
We have used Kitsune to update five popular, open-source, single- and multi-threaded programs, and find that few program changes are required to use Kitsune, and that it incurs essentially no performance overhead.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "OOPSLA '12 conference proceedings.", } @Article{Huang:2012:EPS, author = "Jeff Huang and Charles Zhang", title = "Execution privatization for scheduler-oblivious concurrent programs", journal = j-SIGPLAN, volume = "47", number = "10", pages = "737--752", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384670", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Making multithreaded execution less non-deterministic is a promising solution to address the difficulty of concurrent programming plagued by the non-deterministic thread scheduling. In fact, a vast category of concurrent programs are scheduler-oblivious: their execution is deterministic, regardless of the scheduling behavior. We present and formally prove a fundamental observation of the privatizability property for scheduler-oblivious programs, that paves the theoretical foundation for privatizing shared data accesses on a path segment. With privatization, the non-deterministic thread interleavings on the privatized accesses are isolated and as the consequence many concurrency problems are alleviated. We further present a path and context sensitive privatization algorithm that safely privatizes the program without introducing any additional program behavior. Our evaluation results show that the privatization opportunity pervasively exists in real world large complex concurrent systems. 
Through privatization, several real concurrency bugs are fixed and notable performance improvements are also achieved on benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "OOPSLA '12 conference proceedings.", } @Article{Ismail:2012:CMT, author = "I. A. Ismail and G. S. Mokaddis and Mariam K. Metry", title = "Computing a Matrix Transpose of Multithreading for Queuing Parallel in {Matlab} Programming", journal = j-INT-J-COMP-APPL, volume = "49", number = "??", pages = "41--47", month = jul, year = "2012", CODEN = "????", DOI = "https://doi.org/10.5120/7627-0694", ISSN = "0975-8887", ISSN-L = "0975-8887", bibdate = "Fri Jan 24 08:46:36 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/intjcompappl.bib; https://www.math.utah.edu/pub/tex/bib/matlab.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "https://www.ijcaonline.org/archives/volume49/number5/7627-0694/", acknowledgement = ack-nhfb, ajournal = "Intern. J. of Computer Applications", articleno = "5", fjournal = "International Journal of Computer Applications", journal-URL = "https://www.ijcaonline.org/", } @Article{Joao:2012:BIS, author = "Jos{\'e} A. Joao and M. Aater Suleman and Onur Mutlu and Yale N. Patt", title = "Bottleneck identification and scheduling in multithreaded applications", journal = j-COMP-ARCH-NEWS, volume = "40", number = "1", pages = "223--234", month = mar, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2189750.2151001", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Jun 1 17:06:46 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ASPLOS '12 conference proceedings.", abstract = "Performance of multithreaded applications is limited by a variety of bottlenecks, e.g. critical sections, barriers and slow pipeline stages. 
These bottlenecks serialize execution, waste valuable execution cycles, and limit scalability of applications. This paper proposes Bottleneck Identification and Scheduling in Multithreaded Applications (BIS), a cooperative software-hardware mechanism to identify and accelerate the most critical bottlenecks. BIS identifies which bottlenecks are likely to reduce performance by measuring the number of cycles threads have to wait for each bottleneck, and accelerates those bottlenecks using one or more fast cores on an Asymmetric Chip Multi-Processor (ACMP). Unlike previous work that targets specific bottlenecks, BIS can identify and accelerate bottlenecks regardless of their type. We compare BIS to four previous approaches and show that it outperforms the best of them by 15\% on average. BIS' performance improvement increases as the number of cores and the number of fast cores in the system increase.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Joisha:2012:TTE, author = "Pramod G. Joisha and Robert S. Schreiber and Prithviraj Banerjee and Hans-J. Boehm and Dhruva R. Chakrabarti", title = "On a Technique for Transparently Empowering Classical Compiler Optimizations on Multithreaded Code", journal = j-TOPLAS, volume = "34", number = "2", pages = "9:1--9:??", month = jun, year = "2012", CODEN = "ATPSDT", DOI = "https://doi.org/10.1145/2220365.2220368", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Fri Jun 29 17:33:40 MDT 2012", bibsource = "http://www.acm.org/pubs/contents/journals/toplas/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/toplas.bib", abstract = "A large body of data-flow analyses exists for analyzing and optimizing sequential code. Unfortunately, much of it cannot be directly applied on parallel code, for reasons of correctness. 
This article presents a technique to automatically, aggressively, yet safely apply sequentially-sound data-flow transformations, without change, on shared-memory programs. The technique is founded on the notion of program references being ``siloed'' on certain control-flow paths. Intuitively, siloed references are free of interference from other threads within the confines of such paths. Data-flow transformations can, in general, be unblocked on siloed references. The solution has been implemented in a widely used compiler. Results on benchmarks from SPLASH-2 show that performance improvements of up to 41\% are possible, with an average improvement of 6\% across all the tested programs over all thread counts.", acknowledgement = ack-nhfb, articleno = "9", fjournal = "ACM Transactions on Programming Languages and Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783", } @Article{Kambadur:2012:HCA, author = "Melanie Kambadur and Kui Tang and Martha A. Kim", title = "{Harmony}: collection and analysis of parallel block vectors", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "452--463", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337211", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "Efficient execution of well-parallelized applications is central to performance in the multicore era. Program analysis tools support the hardware and software sides of this effort by exposing relevant features of multithreaded applications. This paper describes parallel block vectors, which uncover previously unseen characteristics of parallel programs. 
Parallel block vectors provide block execution profiles per concurrency phase (e.g., the block execution profile of all serial regions of a program). This information provides a direct and fine-grained mapping between an application's runtime parallel phases and the static code that makes up those phases. This paper also demonstrates how to collect parallel block vectors with minimal application perturbation using Harmony. Harmony is an instrumentation pass for the LLVM compiler that introduces just 16-21\% overhead on average across eight Parsec benchmarks. We apply parallel block vectors to uncover several novel insights about parallel applications with direct consequences for architectural design. First, that the serial and parallel phases of execution used in Amdahl's Law are often composed of many of the same basic blocks. Second, that program features, such as instruction mix, vary based on the degree of parallelism, with serial phases in particular displaying different instruction mixes from the program as a whole. Third, that dynamic execution frequencies do not necessarily correlate with a block's parallelism.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Kawaguchi:2012:DPL, author = "Ming Kawaguchi and Patrick Rondon and Alexander Bakst and Ranjit Jhala", title = "Deterministic parallelism via liquid effects", journal = j-SIGPLAN, volume = "47", number = "6", pages = "45--54", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254071", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "PLDI '12 proceedings.", abstract = "Shared memory multithreading is a popular approach to parallel programming, but also fiendishly hard to get right. 
We present Liquid Effects, a type-and-effect system based on refinement types which allows for fine-grained, low-level, shared memory multi-threading while statically guaranteeing that a program is deterministic. Liquid Effects records the effect of an expression as a formula in first-order logic, making our type-and-effect system highly expressive. Further, effects like Read and Write are recorded in Liquid Effects as ordinary uninterpreted predicates, leaving the effect system open to extension by the user. By building our system as an extension to an existing dependent refinement type system, our system gains precise value- and branch-sensitive reasoning about effects. Finally, our system exploits the Liquid Types refinement type inference technique to automatically infer refinement types and effects. We have implemented our type-and-effect checking techniques in CSOLVE, a refinement type inference system for C programs. We demonstrate how CSOLVE uses Liquid Effects to prove the determinism of a variety of benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Keckler:2012:MMC, author = "Stephen W. Keckler and Steven K. Reinhardt", title = "Massively Multithreaded Computing Systems", journal = j-COMPUTER, volume = "45", number = "8", pages = "24--25", month = aug, year = "2012", CODEN = "CPTRB4", DOI = "https://doi.org/10.1109/MC.2012.270", ISSN = "0018-9162 (print), 1558-0814 (electronic)", ISSN-L = "0018-9162", bibdate = "Wed Aug 29 16:38:07 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/computer2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Computer", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2", } @InProceedings{Khan:2012:MAN, author = "Arif M. Khan and David F. 
Gleich and Alex Pothen and Mahantesh Halappanavar", title = "A multithreaded algorithm for network alignment via approximate matching", crossref = "Hollingsworth:2012:SPI", pages = "64:1--64:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a054.pdf", abstract = "Network alignment is an optimization problem to find the best one-to-one map between the vertices of a pair of graphs that overlaps as many edges as possible. It is a relaxation of the graph isomorphism problem and is closely related to the subgraph isomorphism problem. The best current approaches are entirely heuristic and iterative in nature. They generate real-valued heuristic weights that must be rounded to find integer solutions. This rounding requires solving a bipartite maximum weight matching problem at each iteration in order to avoid missing high quality solutions. We investigate substituting a parallel, half-approximation for maximum weight matching instead of an exact computation. Our experiments show that the resulting difference in solution quality is negligible. We demonstrate almost a 20-fold speedup using 40 threads on an 8 processor Intel Xeon E7-8870 system and now solve real-world problems in 36 seconds instead of 10 minutes.", acknowledgement = ack-nhfb, articleno = "64", } @Article{Khyzha:2012:AP, author = "Artem Khyzha and Pavel Par{\'\i}zek and Corina S. 
P{\u{a}}s{\u{a}}reanu", title = "Abstract pathfinder", journal = j-SIGSOFT, volume = "37", number = "6", pages = "1--5", month = nov, year = "2012", CODEN = "SFENDP", DOI = "https://doi.org/10.1145/2382756.2382794", ISSN = "0163-5948 (print), 1943-5843 (electronic)", ISSN-L = "0163-5948", bibdate = "Wed Aug 1 17:16:18 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigsoft2010.bib", abstract = "We present Abstract Pathfinder, an extension to the Java Pathfinder (JPF) verification tool-set that supports data abstraction to reduce the large data domains of a Java program to small, finite abstract domains, making the program more amenable to verification. We use data abstraction to compute an over-approximation of the original program in such a way that if a (safety) property is true in the abstracted program the property is also true in the original program. Our approach enhances JPF with an abstract interpreter and abstract state-matching mechanisms, together with a library of abstractions from which the user can pick which abstractions to use for a particular application. 
We discuss the details of our implementation together with some preliminary experiments with analyzing multi-threaded Java programs, where Abstract Pathfinder achieves significant time and memory savings as compared with plain JPF.", acknowledgement = ack-nhfb, fjournal = "ACM SIGSOFT Software Engineering Notes", journal-URL = "https://dl.acm.org/citation.cfm?id=J728", } @Article{Kyle:2012:EPI, author = "Stephen Kyle and Igor B{\"o}hm and Bj{\"o}rn Franke and Hugh Leather and Nigel Topham", title = "Efficiently parallelizing instruction set simulation of embedded multi-core processors using region-based just-in-time dynamic binary translation", journal = j-SIGPLAN, volume = "47", number = "5", pages = "21--30", month = may, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345141.2248422", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:46 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "LCTES '12 proceedings.", abstract = "Embedded systems, as typified by modern mobile phones, are already seeing a drive toward using multi-core processors. The number of cores will likely increase rapidly in the future. Engineers and researchers need to be able to simulate systems, as they are expected to be in a few generations time, running simulations of many-core devices on today's multi-core machines. These requirements place heavy demands on the scalability of simulation engines, the fastest of which have typically evolved from just-in-time (Jit) dynamic binary translators (Dbt). Existing work aimed at parallelizing Dbt simulators has focused exclusively on trace-based Dbt, wherein linear execution traces or perhaps trees thereof are the units of translation. Region-based Dbt simulators have not received the same attention and require different techniques than their trace-based cousins. 
In this paper we develop an innovative approach to scaling multi-core, embedded simulation through region-based Dbt. We initially modify the Jit code generator of such a simulator to emit code that does not depend on a particular thread with its thread-specific context and is, therefore, thread-agnostic. We then demonstrate that this thread-agnostic code generation is comparable to thread-specific code with respect to performance, but also enables the sharing of JIT-compiled regions between different threads. This sharing optimisation, in turn, leads to significant performance improvements for multi-threaded applications. In fact, our results confirm that an average of 76\% of all JIT-compiled regions can be shared between 128 threads in representative, parallel workloads. We demonstrate that this translates into an overall performance improvement by 1.44x on average and up to 2.40x across 12 multi-threaded benchmarks taken from the Splash-2 benchmark suite, targeting our high-performance multi-core Dbt simulator for embedded Arc processors running on a 4-core Intel host machine.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Lakshminarayana:2012:DSP, author = "Nagesh B. 
Lakshminarayana and Jaekyu Lee and Hyesoon Kim and Jinwoo Shin", title = "{DRAM} Scheduling Policy for {GPGPU} Architectures Based on a Potential Function", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "11", number = "2", pages = "33--36", month = jul # "\slash " # dec, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2011.32", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Jun 20 17:18:18 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "GPGPU architectures (applications) have several different characteristics compared to traditional CPU architectures (applications): highly multithreaded architectures and SIMD-execution behavior are the two important characteristics of GPGPU computing. In this paper, we propose a potential function that models the DRAM behavior in GPGPU architectures and a DRAM scheduling policy, alpha-SJF policy to minimize the potential function. The scheduling policy essentially chooses between SJF and FR-FCFS at run-time based on the number of requests from each thread and whether the thread has a row buffer hit.", acknowledgement = ack-nhfb, affiliation = "Lakshminarayana, NB (Reprint Author), Georgia Inst Technol, Sch Comp Sci, Atlanta, GA 30332 USA. Lakshminarayana, Nagesh B.; Lee, Jaekyu; Kim, Hyesoon; Shin, Jinwoo, Georgia Inst Technol, Sch Comp Sci, Atlanta, GA 30332 USA.", author-email = "nageshbl@cc.gatech.edu jaekyu.lee@cc.gatech.edu hyesoon.kim@cc.gatech.edu jshin72@cc.gatech.edu", da = "2019-06-20", doc-delivery-number = "057JO", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "DRAM scheduling; GPGPU; Potential function", number-of-cited-references = "5", research-areas = "Computer Science", researcherid-numbers = "Shin, Jinwoo/M-5389-2013", times-cited = "7", unique-id = "Lakshminarayana:2012:DSP", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Leiserson:2012:DPR, author = "Charles E. Leiserson and Tao B. Schardl and Jim Sukha", title = "Deterministic parallel random-number generation for dynamic-multithreading platforms", journal = j-SIGPLAN, volume = "47", number = "8", pages = "193--204", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145841", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "PPOPP '12 conference proceedings.", abstract = "Existing concurrency platforms for dynamic multithreading do not provide repeatable parallel random-number generators. This paper proposes that a mechanism called pedigrees be built into the runtime system to enable efficient deterministic parallel random-number generation. Experiments with the open-source MIT Cilk runtime system show that the overhead for maintaining pedigrees is negligible. Specifically, on a suite of 10 benchmarks, the relative overhead of Cilk with pedigrees to the original Cilk has a geometric mean of less than 1\%. We persuaded Intel to modify its commercial C/C++ compiler, which provides the Cilk Plus concurrency platform, to include pedigrees, and we built a library implementation of a deterministic parallel random-number generator called DotMix that compresses the pedigree and then ``RC6-mixes'' the result. 
The statistical quality of DotMix is comparable to that of the popular Mersenne twister, but somewhat slower than a nondeterministic parallel version of this efficient and high-quality serial random-number generator. The cost of calling DotMix depends on the ``spawn depth'' of the invocation. For a naive Fibonacci calculation with n=40 that calls DotMix in every node of the computation, this ``price of determinism'' is a factor of 2.65 in running time, but for more realistic applications with less intense use of random numbers --- such as a maximal-independent-set algorithm, a practical samplesort program, and a Monte Carlo discrete-hedging application from QuantLib --- the observed ``price'' was less than 5\%. Moreover, even if overheads were several times greater, applications using DotMix should be amply fast for debugging purposes, which is a major reason for desiring repeatability.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Li:2012:MRP, author = "Xin Li and Reinhard von Hanxleden", title = "Multithreaded Reactive Programming --- the {Kiel Esterel} Processor", journal = j-IEEE-TRANS-COMPUT, volume = "61", number = "3", pages = "337--349", month = mar, year = "2012", CODEN = "ITCOB4", DOI = "https://doi.org/10.1109/TC.2010.246", ISSN = "0018-9340 (print), 1557-9956 (electronic)", ISSN-L = "0018-9340", bibdate = "Fri Feb 3 07:35:03 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Computers", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12", } @Article{Ling:2012:HPP, author = "Cheng Ling and Khaled Benkrid and Tsuyoshi Hamada", title = "High performance phylogenetic analysis on {CUDA}-compatible {GPUs}", journal = j-COMP-ARCH-NEWS, volume = "40", number = "5", pages = "52--57", month = dec, year 
= "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2460216.2460226", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Sun May 5 09:49:56 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "HEART '12 conference proceedings.", abstract = "The operation of phylogenetic analysis aims to investigate the evolution and relationships among species. It is widely used in the fields of system biology and comparative genomics. However, phylogenetic analysis is also a computationally intensive operation as the number of tree topology grows in a factorial way with the number of species involved. Therefore, due to the large number of species in the real world, the computational burden has largely thwarted phylogenetic reconstruction. In this paper, we describe the detailed GPU-based multi-threaded design and implementation of a Markov Chain Monte Carlo (MCMC) maximum likelihood algorithm for phylogenetic analysis on a set of aligned nucleotide sequences. The implementation is based on the framework of the most widely used phylogenetic analysis tool, namely MrBayes. 
The proposed approach resulted in 6x-8x speed-up on an NVidia Geforce 460 GTX GPU compared to an optimized GPP-based software implementation running on a desktop computer with a single Intel Xeon 2.53 GHz CPU and 6.0 GB RAM.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Liu:2012:FPA, author = "Gu Liu and Hong An and Wenting Han and Xiaoqiang Li and Tao Sun and Wei Zhou and Xuechao Wei and Xulong Tang", title = "{FlexBFS}: a parallelism-aware implementation of breadth-first search on {GPU}", journal = j-SIGPLAN, volume = "47", number = "8", pages = "279--280", month = aug, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2370036.2145853", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Sep 12 12:11:57 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPOPP '12 conference proceedings.", abstract = "In this paper, we present FlexBFS, a parallelism-aware implementation for breadth-first search on GPU. Our implementation can adjust the computation resources according to the feedback of available parallelism dynamically. We also optimized our program in three ways: (1)a simplified two-level queue management,(2)a combined kernel strategy and (3)a high-degree vertices specialization approach. 
Our experimental results show that it can achieve 3--20 times speedup against the fastest serial version, and can outperform the TBB based multi-threading CPU version and the previous most effective GPU version on all types of input graphs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Mars:2012:BDS, author = "Jason Mars and Naveen Kumar", title = "{BlockChop}: dynamic squash elimination for hybrid processor architecture", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "536--547", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337221", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "Hybrid processors are HW/SW co-designed processors that leverage blocked-execution, the execution of regions of instructions as atomic blocks, to facilitate aggressive speculative optimization. As we move to a multicore hybrid design, fine grained conflicts for shared data can violate the atomicity requirement of these blocks and lead to expensive squashes and rollbacks. However, as these atomic regions differ from those used in checkpointing and transactional memory systems, the extent of this potentially prohibitive problem remains unclear, and mechanisms to mitigate these squashes dynamically may be critical to enable a highly performant multicore hybrid design. In this work, we investigate how multithreaded applications, both benchmark and commercial workloads, are affected by squashes, and present dynamic mechanisms for mitigating these squashes in hybrid processors. 
While the current wisdom is that there is not a significant number of squashes for smaller atomic regions, we observe this is not the case for many multithreaded workloads. With region sizes of just 200--500 instructions, we observe a performance degradation ranging from 10\% to more than 50\% for workloads with a mixture of shared reads and writes. By harnessing the unique flexibility provided by the software subsystem of hybrid processor design, we present BlockChop, a framework for dynamically mitigating squashes on multicore hybrid processors. We present a range of squash handling mechanisms leveraging retrials, interpretation, and retranslation, and find that BlockChop is quite effective. Over the current response to exceptions and squashes in a hybrid design, we are able to improve the performance of benchmark and commercial workloads by 1.4x and 1.2x on average for large and small region sizes respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Miller:2012:VCE, author = "Timothy N. Miller and Renji Thomas and Xiang Pan and Radu Teodorescu", title = "{VRSync}: characterizing and eliminating synchronization-induced voltage emergencies in many-core processors", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "249--260", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337188", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "Power consumption is a primary concern for microprocessor designers. Lowering the supply voltage of processors is one of the most effective techniques for improving their energy efficiency. 
Unfortunately, low-voltage operation faces multiple challenges going forward. One such challenge is increased sensitivity to voltage fluctuations, which can trigger so-called ``voltage emergencies'' that can lead to errors. These fluctuations are caused by abrupt changes in power demand, triggered by processor activity variation as a function of workload. This paper examines the effects of voltage fluctuations on future many-core processors. With the increase in the number of cores in a chip, the effects of chip-wide activity fluctuation --- such as that caused by global synchronization in multithreaded applications --- overshadow the effects of core-level workload variability. Starting from this observation, we developed VRSync, a novel synchronization methodology that uses emergency-aware scheduling policies that reduce the slope of load fluctuations, eliminating emergencies. We show that VRSync is very effective at eliminating emergencies, allowing voltage guardbands to be significantly lowered, which reduces energy consumption by an average of 33\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Nagarakatte:2012:MAP, author = "Santosh Nagarakatte and Sebastian Burckhardt and Milo M. K. Martin and Madanlal Musuvathi", title = "Multicore acceleration of priority-based schedulers for concurrency bug detection", journal = j-SIGPLAN, volume = "47", number = "6", pages = "543--554", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254128", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "PLDI '12 proceedings.", abstract = "Testing multithreaded programs is difficult as threads can interleave in a nondeterministic fashion. 
Untested interleavings can cause failures, but testing all interleavings is infeasible. Many interleaving exploration strategies for bug detection have been proposed, but their relative effectiveness and performance remains unclear as they often lack publicly available implementations and have not been evaluated using common benchmarks. We describe NeedlePoint, an open-source framework that allows selection and comparison of a wide range of interleaving exploration policies for bug detection proposed by prior work. Our experience with NeedlePoint indicates that priority-based probabilistic concurrency testing (the PCT algorithm) finds bugs quickly, but it runs only one thread at a time, which destroys parallelism by serializing executions. To address this problem we propose a parallel version of the PCT algorithm~(PPCT). We show that the new algorithm outperforms the original by a factor of 5x when testing parallel programs on an eight-core machine. We formally prove that parallel PCT provides the same probabilistic coverage guarantees as PCT. 
Moreover, PPCT is the first algorithm that runs multiple threads while providing coverage guarantees.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Nagpal:2012:CGE, author = "Rahul Nagpal and Anasua Bhowmik", title = "Criticality guided energy aware speculation for speculative multithreaded processors", journal = j-PARALLEL-COMPUTING, volume = "38", number = "6--7", pages = "329--341", month = jun # "\slash " # jul, year = "2012", CODEN = "PACOEJ", DOI = "https://doi.org/10.1016/j.parco.2012.03.002", ISSN = "0167-8191 (print), 1872-7336 (electronic)", ISSN-L = "0167-8191", bibdate = "Sun May 20 09:14:24 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib; http://www.sciencedirect.com/science/journal/01678191", URL = "http://www.sciencedirect.com/science/article/pii/S0167819112000191", acknowledgement = ack-nhfb, fjournal = "Parallel Computing", journal-URL = "http://www.sciencedirect.com/science/journal/01678191", } @Article{Oh:2012:MTS, author = "Doohwan Oh and Won W. Ro", title = "Multi-Threading and Suffix Grouping on Massive Multiple Pattern Matching Algorithm", journal = j-COMP-J, volume = "55", number = "11", pages = "1331--1346", month = nov, year = "2012", CODEN = "CMPJA6", DOI = "https://doi.org/10.1093/comjnl/bxs002", ISSN = "0010-4620 (print), 1460-2067 (electronic)", ISSN-L = "0010-4620", bibdate = "Thu Nov 1 11:25:36 MDT 2012", bibsource = "http://comjnl.oxfordjournals.org/content/55/11.toc; https://www.math.utah.edu/pub/tex/bib/compj2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://comjnl.oxfordjournals.org/content/55/11/1331.full.pdf+html", acknowledgement = ack-nhfb, fjournal = "The Computer Journal", journal-URL = "http://comjnl.oxfordjournals.org/", onlinedate = "February 2, 2012", } @InProceedings{Olivier:2012:CMW, author = "Stephen L. 
Olivier and Bronis R. de Supinski and Martin Schulz and Jan F. Prins", title = "Characterizing and mitigating work time inflation in task parallel programs", crossref = "Hollingsworth:2012:SPI", pages = "65:1--65:12", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a066.pdf", abstract = "Task parallelism raises the level of abstraction in shared memory parallel programming to simplify the development of complex applications. However, task parallel applications can exhibit poor performance due to thread idleness, scheduling overheads, and work time inflation --- additional time spent by threads in a multithreaded computation beyond the time required to perform the same work in a sequential computation. We identify the contributions of each factor to lost efficiency in various task parallel OpenMP applications and diagnose the causes of work time inflation in those applications. Increased data access latency can cause significant work time inflation in NUMA systems. Our locality framework for task parallel OpenMP programs mitigates this cause of work time inflation. Our extensions to the Qthreads library demonstrate that locality-aware scheduling can improve performance up to 3X compared to the Intel OpenMP task scheduler.", acknowledgement = ack-nhfb, articleno = "65", } @InProceedings{Preissl:2012:CSS, author = "Robert Preissl and Theodore M. Wong and Pallab Datta and Myron Flickner and Raghavendra Singh and Steven K. Esser and William P. Risk and Horst D. Simon and Dharmendra S. 
Modha", title = "{Compass}: a scalable simulator for an architecture for cognitive computing", crossref = "Hollingsworth:2012:SPI", pages = "54:1--54:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a085.pdf", abstract = "Inspired by the function, power, and volume of the organic brain, we are developing TrueNorth, a novel modular, non-von Neumann, ultra-low power, compact architecture. TrueNorth consists of a scalable network of neurosynaptic cores, with each core containing neurons, dendrites, synapses, and axons. To set sail for TrueNorth, we developed Compass, a multi-threaded, massively parallel functional simulator and a parallel compiler that maps a network of long-distance pathways in the macaque monkey brain to TrueNorth. We demonstrate near-perfect weak scaling on a 16 rack IBM\reg{} Blue Gene\reg{}/Q (262144 CPUs, 256 TB memory), achieving an unprecedented scale of 256 million neurosynaptic cores containing 65 billion neurons and 16 trillion synapses running only 388X slower than real time with an average spiking rate of 8.1 Hz. By using emerging PGAS communication primitives, we also demonstrate 2X better real-time performance over MPI primitives on a 4 rack Blue Gene/P (16384 CPUs, 16 TB memory).", acknowledgement = ack-nhfb, articleno = "54", } @Article{Pusukuri:2012:TTD, author = "Kishore Kumar Pusukuri and Rajiv Gupta and Laxmi N. 
Bhuyan", title = "Thread Tranquilizer: Dynamically reducing performance variation", journal = j-TACO, volume = "8", number = "4", pages = "46:1--46:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2086696.2086725", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "To realize the performance potential of multicore systems, we must effectively manage the interactions between memory reference behavior and the operating system policies for thread scheduling and migration decisions. We observe that these interactions lead to significant variations in the performance of a given application, from one execution to the next, even when the program input remains unchanged and no other applications are being run on the system. Our experiments with multithreaded programs, including the TATP database application, SPECjbb2005, and a subset of PARSEC and SPEC OMP programs, on a 24-core Dell PowerEdge R905 server running OpenSolaris confirms the above observation.", acknowledgement = ack-nhfb, articleno = "46", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", } @Article{Quintana-Orti:2012:RSP, author = "Gregorio Quintana-Ort{\'\i} and Francisco D. Igual and Mercedes Marqu{\'e}s and Enrique S. Quintana-Ort{\'\i} and Robert A. 
van de Geijn", title = "A Runtime System for Programming Out-of-Core Matrix Algorithms-by-Tiles on Multithreaded Architectures", journal = j-TOMS, volume = "38", number = "4", pages = "25:1--25:25", month = aug, year = "2012", CODEN = "ACMSCU", DOI = "https://doi.org/10.1145/2331130.2331133", ISSN = "0098-3500 (print), 1557-7295 (electronic)", ISSN-L = "0098-3500", bibdate = "Thu Aug 30 18:55:10 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/toms.bib", abstract = "Out-of-core implementations of algorithms for dense matrix computations have traditionally focused on optimal use of memory so as to minimize I/O, often trading programmability for performance. In this article we show how the current state of hardware and software allows the programmability problem to be addressed without sacrificing performance. This comes from the realizations that memory is cheap and large, making it less necessary to optimally orchestrate I/O, and that new algorithms view matrices as collections of submatrices and computation as operations with those submatrices. This enables libraries to be coded at a high level of abstraction, leaving the tasks of scheduling the computations and data movement in the hands of a runtime system. This is in sharp contrast to more traditional approaches that leverage optimal use of in-core memory and, at the expense of introducing considerable programming complexity, explicit overlap of I/O with computation. Performance is demonstrated for this approach on multicore architectures as well as platforms equipped with hardware accelerators.", acknowledgement = ack-nhfb, articleno = "25", fjournal = "ACM Transactions on Mathematical Software (TOMS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J782", } @Article{Radojkovic:2012:EIS, author = "Petar Radojkovi{\'c} and Sylvain Girbal and Arnaud Grasset and Eduardo Qui{\~n}ones and Sami Yehia and Francisco J. 
Cazorla", title = "On the evaluation of the impact of shared resources in multithreaded {COTS} processors in time-critical environments", journal = j-TACO, volume = "8", number = "4", pages = "34:1--34:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2086696.2086713", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Commercial Off-The-Shelf (COTS) processors are now commonly used in real-time embedded systems. The characteristics of these processors fulfill system requirements in terms of time-to-market, low cost, and high performance-per-watt ratio. However, multithreaded (MT) processors are still not widely used in real-time systems because the timing analysis is too complex. In MT processors, simultaneously-running tasks share and compete for processor resources, so the timing analysis has to estimate the possible impact that the inter-task interferences have on the execution time of the applications. In this paper, we propose a method that quantifies the slowdown that simultaneously-running tasks may experience due to collision in shared processor resources.", acknowledgement = ack-nhfb, articleno = "34", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", } @Article{Radojkovic:2012:OTA, author = "Petar Radojkovi{\'c} and Vladimir Cakarevi{\'c} and Miquel Moret{\'o} and Javier Verd{\'u} and Alex Pajuelo and Francisco J. 
Cazorla and Mario Nemirovsky and Mateo Valero", title = "Optimal task assignment in multithreaded processors: a statistical approach", journal = j-COMP-ARCH-NEWS, volume = "40", number = "1", pages = "235--248", month = mar, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2189750.2151002", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Jun 1 17:06:46 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ASPLOS '12 conference proceedings.", abstract = "The introduction of massively multithreaded (MMT) processors, comprised of a large number of cores with many shared resources, has made task scheduling, in particular task to hardware thread assignment, one of the most promising ways to improve system performance. However, finding an optimal task assignment for a workload running on MMT processors is an NP-complete problem. Due to the fact that the performance of the best possible task assignment is unknown, the room for improvement of current task-assignment algorithms cannot be determined. This is a major problem for the industry because it could lead to: (1)~A waste of resources if excessive effort is devoted to improving a task assignment algorithm that already provides a performance that is close to the optimal one, or (2)~significant performance loss if insufficient effort is devoted to improving poorly-performing task assignment algorithms. In this paper, we present a method based on Extreme Value Theory that allows the prediction of the performance of the optimal task assignment in MMT processors. We further show that executing a sample of several hundred or several thousand random task assignments is enough to obtain, with very high confidence, an assignment with a performance that is close to the optimal one. 
We validate our method with an industrial case study for a set of multithreaded network applications running on an UltraSPARC~T2 processor.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Reda:2012:APC, author = "Sherief Reda and Ryan Cochran and Ayse K. Coskun", title = "Adaptive Power Capping for Servers with Multithreaded Workloads", journal = j-IEEE-MICRO, volume = "32", number = "5", pages = "64--75", month = sep # "\slash " # oct, year = "2012", CODEN = "IEMIDZ", DOI = "https://doi.org/10.1109/MM.2012.59", ISSN = "0272-1732 (print), 1937-4143 (electronic)", ISSN-L = "0272-1732", bibdate = "Thu Nov 15 05:59:33 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeemicro.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Micro", journal-URL = "http://www.computer.org/csdl/mags/mi/index.html", } @Article{Rivara:2012:MPL, author = "Maria-Cecilia Rivara and Pedro Rodriguez and Rafael Montenegro and Gaston Jorquera", title = "Multithread parallelization of {Lepp}-bisection algorithms", journal = j-APPL-NUM-MATH, volume = "62", number = "4", pages = "473--488", month = apr, year = "2012", CODEN = "ANMAEL", DOI = "https://doi.org/10.1016/j.apnum.2011.07.011", ISSN = "0168-9274 (print), 1873-5460 (electronic)", ISSN-L = "0168-9274", bibdate = "Thu Mar 8 07:24:47 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/applnummath.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sciencedirect.com/science/journal/01689274", URL = "http://www.sciencedirect.com/science/article/pii/S0168927411001292", acknowledgement = ack-nhfb, fjournal = "Applied Numerical Mathematics", journal-URL = "http://www.sciencedirect.com/science/journal/01689274", } @Article{Sartor:2012:EMT, author = "Jennifer B. 
Sartor and Lieven Eeckhout", title = "Exploring multi-threaded {Java} application performance on multicore hardware", journal = j-SIGPLAN, volume = "47", number = "10", pages = "281--296", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384638", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "While there have been many studies of how to schedule applications to take advantage of increasing numbers of cores in modern-day multicore processors, few have focused on multi-threaded managed language applications which are prevalent from the embedded to the server domain. Managed languages complicate performance studies because they have additional virtual machine threads that collect garbage and dynamically compile, closely interacting with application threads. Further complexity is introduced as modern multicore machines have multiple sockets and dynamic frequency scaling options, broadening opportunities to reduce both power and running time. In this paper, we explore the performance of Java applications, studying how best to map application and virtual machine (JVM) threads to a multicore, multi-socket environment. We explore both the cost of separating JVM threads from application threads, and the opportunity to speed up or slow down the clock frequency of isolated threads. We perform experiments with the multi-threaded DaCapo benchmarks and pseudojbb2005 running on the Jikes Research Virtual Machine, on a dual-socket, 8-core Intel Nehalem machine to reveal several novel, and sometimes counter-intuitive, findings. 
We believe these insights are a first but important step towards understanding and optimizing managed language performance on modern hardware.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "OOPSLA '12 conference proceedings.", } @Article{Sharafeddine:2012:DOE, author = "Mageda Sharafeddine and Komal Jothi and Haitham Akkary", title = "Disjoint out-of-order execution processor", journal = j-TACO, volume = "9", number = "3", pages = "19:1--19:??", month = sep, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2355585.2355592", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 22 10:48:53 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "High-performance superscalar architectures used to exploit instruction level parallelism in single-thread applications have become too complex and power hungry for the multicore processors era. We propose a new architecture that uses multiple small latency-tolerant out-of-order cores to improve single-thread performance. Improving single-thread performance with multiple small out-of-order cores allows designers to place more of these cores on the same die. Consequently, emerging highly parallel applications can take full advantage of the multicore parallel hardware without sacrificing performance of inherently serial and hard to parallelize applications. Our architecture combines speculative multithreading (SpMT) with checkpoint recovery and continual flow pipeline architectures. It splits single-thread program execution into disjoint control and data threads that execute concurrently on multiple cooperating small and latency-tolerant out-of-order cores. Hence we call this style of execution Disjoint Out-of-Order Execution (DOE). 
DOE uses latency tolerance to overcome performance issues of SpMT caused by interthread data dependences. To evaluate this architecture, we have developed a microarchitecture performance model of DOE based on PTLSim, a simulation infrastructure of the x86 instruction set architecture. We evaluate the potential performance of DOE processor architecture using a simple heuristic to fork control independent threads in hardware at the target addresses of future procedure return instructions. Using applications from SpecInt 2000, we study DOE under ideal as well as realistic architectural constraints. We discuss the performance impact of key DOE architecture and application variables such as number of cores, interthread data dependences, intercore data communication delay, buffers capacity, and branch mispredictions. Without any DOE specific compiler optimizations, our results show that DOE outperforms conventional SpMT architectures by 15\%, on average. We also show that DOE with four small cores can perform on average equally well to a large superscalar core, consuming about the same power. 
Most importantly, DOE improves throughput performance by a significant amount over a large superscalar core, up to 2.5 times, when running multitasking applications.", acknowledgement = ack-nhfb, articleno = "19", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", } @Article{Shirole:2012:TCU, author = "Mahesh Shirole and Rajeev Kumar", title = "Testing for concurrency in {UML} diagrams", journal = j-SIGSOFT, volume = "37", number = "5", pages = "1--8", month = sep, year = "2012", CODEN = "SFENDP", DOI = "https://doi.org/10.1145/2347696.2347712", ISSN = "0163-5948 (print), 1943-5843 (electronic)", ISSN-L = "0163-5948", bibdate = "Wed Aug 1 17:16:16 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigsoft2010.bib", abstract = "Concurrent programming is increasingly being used in many applications with the advent of multi-cores. The necessary support for execution of multi-threading is getting richer. Notwithstanding, a concurrent program may behave nondeterministically, it may result in different outputs with the same input in different runs. The aim of this study is to generate test sequences for concurrency from unified modelling language (UML) behavioral models such as sequence and activity diagrams. Generating exhaustive test cases for all concurrent interleaving sequences is exponential in size. Therefore, it is necessary to find adequate test cases in presence of concurrency to uncover errors due to, e.g., data race, synchronization and deadlocks. In order to generate adequate test cases a novel search algorithm, which we call concurrent queue search (CQS) is proposed. The CQS handles random nature of concurrent tasks. To generate test scenarios, a sequence diagram is converted into an activity diagram. An activity diagram encapsulates sequential, conditional, iterative and concurrent flows of the control.
By the experimental results, it was observed that test sequences generated by CQS algorithm are superior as compared to DFS and BFS search algorithms.", acknowledgement = ack-nhfb, fjournal = "ACM SIGSOFT Software Engineering Notes", journal-URL = "https://dl.acm.org/citation.cfm?id=J728", } @Article{Singh:2012:EES, author = "Abhayendra Singh and Satish Narayanasamy and Daniel Marino and Todd Millstein and Madanlal Musuvathi", title = "End-to-end sequential consistency", journal = j-COMP-ARCH-NEWS, volume = "40", number = "3", pages = "524--535", month = jun, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2366231.2337220", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Sep 6 10:21:07 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '12 conference proceedings.", abstract = "Sequential consistency (SC) is arguably the most intuitive behavior for a shared-memory multithreaded program. It is widely accepted that language-level SC could significantly improve programmability of a multiprocessor system. However, efficiently supporting end-to-end SC remains a challenge as it requires that both compiler and hardware optimizations preserve SC semantics. While a recent study has shown that a compiler can preserve SC semantics for a small performance cost, an efficient and complexity-effective SC hardware remains elusive. Past hardware solutions relied on aggressive speculation techniques, which has not yet been realized in a practical implementation. This paper exploits the observation that hardware need not enforce any memory model constraints on accesses to thread-local and shared read-only locations. A processor can easily determine a large fraction of these safe accesses with assistance from static compiler analysis and the hardware memory management unit. 
We discuss a low-complexity hardware design that exploits this information to reduce the overhead in ensuring SC. Our design employs an additional unordered store buffer for fast-tracking thread-local stores and allowing later memory accesses to proceed without a memory ordering related stall. Our experimental study shows that the cost of guaranteeing end-to-end SC is only 6.2\% on average when compared to a system with TSO hardware executing a stock compiler's output.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Suito:2012:DRM, author = "Kazutoshi Suito and Rikuhei Ueda and Kei Fujii and Takuma Kogo and Hiroki Matsutani and Nobuyuki Yamasaki", title = "The Dependable Responsive Multithreaded Processor for Distributed Real-Time Systems", journal = j-IEEE-MICRO, volume = "32", number = "6", pages = "52--61", month = nov # "\slash " # dec, year = "2012", CODEN = "IEMIDZ", DOI = "https://doi.org/10.1109/MM.2012.88", ISSN = "0272-1732 (print), 1937-4143 (electronic)", ISSN-L = "0272-1732", bibdate = "Thu Dec 13 15:52:22 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeemicro.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Micro", journal-URL = "http://www.computer.org/csdl/mags/mi/index.html", } @Article{Terechko:2012:BPS, author = "Andrei Terechko and Jan Hoogerbrugge and Ghiath Alkadi and Surendra Guntur and Anirban Lahiri and Marc Duranton and Clemens W{\"u}st and Phillip Christie and Axel Nackaerts and Aatish Kumar", title = "Balancing Programmability and Silicon Efficiency of Heterogeneous Multicore Architectures", journal = j-TECS, volume = "11S", number = "1", pages = "14:1--14:??", year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2180887.2180890", ISSN = "1539-9087 (print), 1558-3465 (electronic)", ISSN-L = "1539-9087", bibdate = "Thu Jun 7 16:18:52 MDT 2012", 
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/tecs.bib", abstract = "Multicore architectures provide scalable performance with a lower hardware design effort than single core processors. Our article presents a design methodology and an embedded multicore architecture, focusing on reducing the software design complexity and boosting the performance density. First, we analyze characteristics of the Task-Level Parallelism in modern multimedia workloads. These characteristics are used to formulate requirements for the programming model. Then we translate the programming model requirements to an architecture specification, including a novel low-complexity implementation of cache coherence and a hardware synchronization unit. Our evaluation demonstrates that the novel coherence mechanism substantially simplifies hardware design, while reducing the performance by less than 18\% relative to a complex snooping technique. Compared to a single processor core, the multicores have already proven to be more area- and energy-efficient. However, the multicore architectures in embedded systems still compete with highly efficient function-specific hardware accelerators. In this article we identify five architectural methods to boost performance density of multicores; microarchitectural downscaling, asymmetric multicore architectures, multithreading, generic accelerators, and conjoining. Then, we present a novel methodology to explore multicore design spaces, including the architectural methods improving the performance density. The methodology is based on a complex formula computing performances of heterogeneous multicore systems. Using this design space exploration methodology for HD and QuadHD H.264 video decoding, we estimate that the required areas of multicores in CMOS 45 nm are 2.5 mm$^2$ and 8.6 mm$^2$, respectively. 
These results suggest that heterogeneous multicores are cost-effective for embedded applications and can provide a good programmability support.", acknowledgement = ack-nhfb, articleno = "14", fjournal = "ACM Transactions on Embedded Computing Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J840", } @Article{Tumeo:2012:DNG, author = "Antonino Tumeo and Simone Secchi and Oreste Villa", title = "Designing Next-Generation Massively Multithreaded Architectures for Irregular Applications", journal = j-COMPUTER, volume = "45", number = "8", pages = "53--61", month = aug, year = "2012", CODEN = "CPTRB4", DOI = "https://doi.org/10.1109/MC.2012.193", ISSN = "0018-9162 (print), 1558-0814 (electronic)", ISSN-L = "0018-9162", bibdate = "Wed Aug 29 16:38:07 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/computer2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Computer", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2", } @Article{Villa:2012:FAS, author = "Oreste Villa and Antonino Tumeo and Simone Secchi and Joseph B. 
Manzano", title = "Fast and Accurate Simulation of the {Cray XMT} Multithreaded Supercomputer", journal = j-IEEE-TRANS-PAR-DIST-SYS, volume = "23", number = "12", pages = "2266--2279", month = dec, year = "2012", CODEN = "ITDSEO", DOI = "https://doi.org/10.1109/TPDS.2012.70", ISSN = "1045-9219 (print), 1558-2183 (electronic)", ISSN-L = "1045-9219", bibdate = "Thu Nov 15 06:27:40 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/super.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Parallel and Distributed Systems", journal-URL = "http://www.computer.org/tpds/archives.htm", } @Article{Vitali:2012:LSO, author = "Roberto Vitali and Alessandro Pellegrini and Francesco Quaglia", title = "Load sharing for optimistic parallel simulations on multi core machines", journal = j-SIGMETRICS, volume = "40", number = "3", pages = "2--11", month = dec, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2425248.2425250", ISSN = "0163-5999 (print), 1557-9484 (electronic)", ISSN-L = "0163-5999", bibdate = "Sun May 5 09:58:20 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigmetrics.bib", abstract = "Parallel Discrete Event Simulation (PDES) is based on the partitioning of the simulation model into distinct Logical Processes (LPs), each one modeling a portion of the entire system, which are allowed to execute simulation events concurrently. This allows exploiting parallel computing architectures to speedup model execution, and to make very large models tractable. In this article we cope with the optimistic approach to PDES, where LPs are allowed to concurrently process their events in a speculative fashion, and rollback/ recovery techniques are used to guarantee state consistency in case of causality violations along the speculative execution path. 
Particularly, we present an innovative load sharing approach targeted at optimizing resource usage for fruitful simulation work when running an optimistic PDES environment on top of multi-processor/multi-core machines. Beyond providing the load sharing model, we also define a load sharing oriented architectural scheme, based on a symmetric multi-threaded organization of the simulation platform. Finally, we present a real implementation of the load sharing architecture within the open source ROme OpTimistic Simulator (ROOT-Sim) package. Experimental data for an assessment of both viability and effectiveness of our proposal are presented as well.", acknowledgement = ack-nhfb, fjournal = "ACM SIGMETRICS Performance Evaluation Review", journal-URL = "http://portal.acm.org/toc.cfm?id=J618", } @Article{Volos:2012:ATM, author = "Haris Volos and Andres Jaan Tack and Michael M. Swift and Shan Lu", title = "Applying transactional memory to concurrency bugs", journal = j-COMP-ARCH-NEWS, volume = "40", number = "1", pages = "211--222", month = mar, year = "2012", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2189750.2150999", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Jun 1 17:06:46 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ASPLOS '12 conference proceedings.", abstract = "Multithreaded programs often suffer from synchronization bugs such as atomicity violations and deadlocks. These bugs arise from complicated locking strategies and ad hoc synchronization methods to avoid the use of locks. A survey of the bug databases of major open-source applications shows that concurrency bugs often take multiple fix attempts, and that fixes often introduce yet more concurrency bugs. Transactional memory (TM) enables programmers to declare regions of code atomic without specifying a lock and has the potential to avoid these bugs. 
Where most previous studies have focused on using TM to write new programs from scratch, we consider its utility in fixing existing programs with concurrency bugs. We therefore investigate four methods of using TM on three concurrent programs. Overall, we find that 29\% of the bugs are not fixable by transactional memory, showing that TM does not address many important types of concurrency bugs. In particular, TM works poorly with extremely long critical sections and with deadlocks involving both condition variables and I/O. Conversely, we find that for 56\% of the bugs, transactional memory offers demonstrable value by simplifying the reasoning behind a fix or the effort to implement a fix, and using transactions in the first place would have avoided 71\% of the bugs examined. We also find that ad hoc synchronization put in place to avoid the overhead of locking can be greatly simplified with TM, but requires hardware support to perform well.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Wei:2012:OLL, author = "Zheng Wei and Joseph Jaja", title = "Optimization of Linked List Prefix Computations on Multithreaded {GPUs} Using {CUDA}", journal = j-PARALLEL-PROCESS-LETT, volume = "22", number = "4", pages = "1250012", month = dec, year = "2012", CODEN = "PPLTEE", DOI = "https://doi.org/10.1142/S0129626412500120", ISSN = "0129-6264 (print), 1793-642X (electronic)", ISSN-L = "0129-6264", bibdate = "Sat Jun 22 15:54:17 MDT 2013", bibsource = "http://ejournals.wspc.com.sg/ppl/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/parallelprocesslett.bib", acknowledgement = ack-nhfb, fjournal = "Parallel Processing Letters", journal-URL = "http://www.worldscientific.com/loi/ppl", } @Article{Wu:2012:SPA, author = "Jingyue Wu and Yang Tang and Gang Hu and Heming Cui and Junfeng Yang", title = "Sound and precise analysis of 
parallel programs through schedule specialization", journal = j-SIGPLAN, volume = "47", number = "6", pages = "205--216", month = jun, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345156.2254090", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:49 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "PLDI '12 proceedings.", abstract = "Parallel programs are known to be difficult to analyze. A key reason is that they typically have an enormous number of execution interleavings, or schedules. Static analysis over all schedules requires over-approximations, resulting in poor precision; dynamic analysis rarely covers more than a tiny fraction of all schedules. We propose an approach called schedule specialization to analyze a parallel program over only a small set of schedules for precision, and then enforce these schedules at runtime for soundness of the static analysis results. We build a schedule specialization framework for C/C++ multithreaded programs that use Pthreads. Our framework avoids the need to modify every analysis to be schedule-aware by specializing a program into a simpler program based on a schedule, so that the resultant program can be analyzed with stock analyses for improved precision. Moreover, our framework provides a precise schedule-aware def-use analysis on memory locations, enabling us to build three highly precise analyses: an alias analyzer, a data-race detector, and a path slicer. 
Evaluation on 17 programs, including 2 real-world programs and 15 popular benchmarks, shows that analyses using our framework reduced may-aliases by 61.9\%, false race reports by 69\%, and path slices by 48.7\%; and detected 7 unknown bugs in well-checked programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Xekalakis:2012:MSM, author = "Polychronis Xekalakis and Nikolas Ioannou and Marcelo Cintra", title = "Mixed speculative multithreaded execution models", journal = j-TACO, volume = "9", number = "3", pages = "18:1--18:??", month = sep, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2355585.2355591", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 22 10:48:53 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The current trend toward multicore architectures has placed great pressure on programmers and compilers to generate thread-parallel programs. Improved execution performance can no longer be obtained via traditional single-thread instruction level parallelism (ILP), but, instead, via multithreaded execution. One notable technique that facilitates the extraction of parallel threads from sequential applications is thread-level speculation (TLS). This technique allows programmers/compilers to generate threads without checking for inter-thread data and control dependences, which are then transparently enforced by the hardware. Most prior work on TLS has concentrated on thread selection and mechanisms to efficiently support the main TLS operations, such as squashes, data versioning, and commits. This article seeks to enhance TLS functionality by combining it with other speculative multithreaded execution models. 
The main idea is that TLS already requires extensive hardware support, which when slightly augmented can accommodate other speculative multithreaded techniques. Recognizing that for different applications, or even program phases, the application bottlenecks may be different, it is reasonable to assume that the more versatile a system is, the more efficiently it will be able to execute the given program. Toward this direction, we first show that mixed execution models that combine TLS with Helper Threads (HT), RunAhead execution (RA) and MultiPath execution (MP) perform better than any of the models alone. Based on a simple model that we propose, we show that benefits come from being able to extract additional ILP without harming the TLP extracted by TLS. We then show that by combining all the execution models in a unified one that combines all these speculative multithreaded models, ILP can be further enhanced with only minimal additional cost in hardware.", acknowledgement = ack-nhfb, articleno = "18", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", } @Article{Xue:2012:RJC, author = "Jingling Xue", title = "Rethinking {Java} call stack design for tiny embedded devices", journal = j-SIGPLAN, volume = "47", number = "5", pages = "1--10", month = may, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2345141.2248420", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 6 16:31:46 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "LCTES '12 proceedings.", abstract = "The ability of tiny embedded devices to run large feature-rich programs is typically constrained by the amount of memory installed on such devices. Furthermore, the useful operation of these devices in wireless sensor applications is limited by their battery life. 
This paper presents a call stack redesign targeted at an efficient use of RAM storage and CPU cycles by a Java program running on a wireless sensor mote. Without compromising the application programs, our call stack redesign saves 30\% of RAM, on average, evaluated over a large number of benchmarks. On the same set of benchmarks, our design also avoids frequent RAM allocations and deallocations, resulting in average 80\% fewer memory operations and 23\% faster program execution. These may be critical improvements for tiny embedded devices that are equipped with small amount of RAM and limited battery life. However, our call stack redesign is equally effective for any complex multi-threaded object oriented program developed for desktop computers. We describe the redesign, measure its performance and report the resulting savings in RAM and execution time for a wide variety of programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Yamashita:2012:APS, author = "Makoto Yamashita and Katsuki Fujisawa and Mituhiro Fukuda and Kazuhide Nakata and Maho Nakata", title = "{Algorithm 925}: Parallel Solver for Semidefinite Programming Problem having Sparse {Schur} Complement Matrix", journal = j-TOMS, volume = "39", number = "1", pages = "6:1--6:22", month = nov, year = "2012", CODEN = "ACMSCU", DOI = "https://doi.org/10.1145/2382585.2382591", ISSN = "0098-3500 (print), 1557-7295 (electronic)", ISSN-L = "0098-3500", bibdate = "Thu Dec 6 07:36:30 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/toms.bib", abstract = "A SemiDefinite Programming (SDP) problem is one of the most central problems in mathematical optimization. SDP provides an effective computation framework for many research fields.
Some applications, however, require solving a large-scale SDP whose size exceeds the capacity of a single processor both in terms of computation time and available memory. SDPARA (SemiDefinite Programming Algorithm paRAllel package) [Yamashita et al. 2003b] was designed to solve such large-scale SDPs. Its parallel performance is outstanding for general SDPs in most cases. However, the parallel implementation is less successful for some sparse SDPs obtained from applications such as Polynomial Optimization Problems (POPs) or Sensor Network Localization (SNL) problems, since this version of SDPARA cannot directly handle sparse Schur Complement Matrices (SCMs). In this article we improve SDPARA by focusing on the sparsity of the SCM and we propose a new parallel implementation using the formula-cost-based distribution along with a replacement of the dense Cholesky factorization. We verify numerically that these features are key to solving SDPs with sparse SCMs more quickly on parallel computing systems. The performance is further enhanced by multithreading and the new SDPARA attains considerable scalability in general. 
It also finds solutions for extremely large-scale SDPs arising from POPs which cannot be obtained by other solvers.", acknowledgement = ack-nhfb, articleno = "6", fjournal = "ACM Transactions on Mathematical Software (TOMS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J782", } @Article{Yu:2012:MCD, author = "Jie Yu and Satish Narayanasamy and Cristiano Pereira and Gilles Pokam", title = "{Maple}: a coverage-driven testing tool for multithreaded programs", journal = j-SIGPLAN, volume = "47", number = "10", pages = "485--502", month = oct, year = "2012", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2398857.2384651", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Nov 15 16:40:23 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Testing multithreaded programs is a hard problem, because it is challenging to expose those rare interleavings that can trigger a concurrency bug. We propose a new thread interleaving coverage-driven testing tool called Maple that seeks to expose untested thread interleavings as much as possible. It memoizes tested interleavings and actively seeks to expose untested interleavings for a given test input to increase interleaving coverage. We discuss several solutions to realize the above goal. First, we discuss a coverage metric based on a set of interleaving idioms. Second, we discuss an online technique to predict untested interleavings that can potentially be exposed for a given test input. Finally, the predicted untested interleavings are exposed by actively controlling the thread schedule while executing for the test input. 
We discuss our experiences in using the tool to expose several known and unknown bugs in real-world applications such as Apache and MySQL.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "OOPSLA '12 conference proceedings.", } @Article{Zhang:2012:SCC, author = "Eddy Zheng Zhang and Yunlian Jiang and Xipeng Shen", title = "The Significance of {CMP} Cache Sharing on Contemporary Multithreaded Applications", journal = j-IEEE-TRANS-PAR-DIST-SYS, volume = "23", number = "2", pages = "367--374", month = feb, year = "2012", CODEN = "ITDSEO", DOI = "https://doi.org/10.1109/TPDS.2011.130", ISSN = "1045-9219 (print), 1558-2183 (electronic)", ISSN-L = "1045-9219", bibdate = "Thu Mar 01 14:47:13 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Parallel and Distributed Systems", journal-URL = "http://www.computer.org/tpds/archives.htm", } @Article{Zhuravlev:2012:SST, author = "Sergey Zhuravlev and Juan Carlos Saez and Sergey Blagodurov and Alexandra Fedorova and Manuel Prieto", title = "Survey of scheduling techniques for addressing shared resources in multicore processors", journal = j-COMP-SURV, volume = "45", number = "1", pages = "4:1--4:??", month = nov, year = "2012", CODEN = "CMSVAN", DOI = "https://doi.org/10.1145/2379776.2379780", ISSN = "0360-0300 (print), 1557-7341 (electronic)", ISSN-L = "0360-0300", bibdate = "Thu Dec 6 10:55:59 MST 2012", bibsource = "http://www.acm.org/pubs/contents/journals/surveys/; https://www.math.utah.edu/pub/tex/bib/compsurv.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Chip multicore processors (CMPs) have emerged as the dominant architecture choice for modern computing platforms and will most likely continue to be dominant well into the foreseeable future. 
As with any system, CMPs offer a unique set of challenges. Chief among them is the shared resource contention that results because CMP cores are not independent processors but rather share common resources among cores such as the last level cache (LLC). Shared resource contention can lead to severe and unpredictable performance impact on the threads running on the CMP. Conversely, CMPs offer tremendous opportunities for multithreaded applications, which can take advantage of simultaneous thread execution as well as fast inter thread data sharing. Many solutions have been proposed to deal with the negative aspects of CMPs and take advantage of the positive. This survey focuses on the subset of these solutions that exclusively make use of OS thread-level scheduling to achieve their goals. These solutions are particularly attractive as they require no changes to hardware and minimal or no changes to the OS. The OS scheduler has expanded well beyond its original role of time-multiplexing threads on a single core into a complex and effective resource manager. 
This article surveys a multitude of new and exciting work that explores the diverse new roles the OS scheduler can successfully take on.", acknowledgement = ack-nhfb, articleno = "4", fjournal = "ACM Computing Surveys", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J204", } @Article{Beckert:2013:DLD, author = "Bernhard Beckert and Vladimir Klebanov", title = "A {Dynamic Logic} for deductive verification of multi-threaded programs", journal = j-FORM-ASP-COMPUT, volume = "25", number = "3", pages = "405--437", month = may, year = "2013", CODEN = "FACME5", DOI = "https://doi.org/10.1007/s00165-012-0261-4", ISSN = "0934-5043 (print), 1433-299X (electronic)", ISSN-L = "0934-5043", bibdate = "Wed Mar 18 05:35:14 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/formaspcomput.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer.com/article/10.1007/s00165-012-0261-4", acknowledgement = ack-nhfb, fjournal = "Formal Aspects of Computing", journal-URL = "http://link.springer.com/journal/165", } @Article{Bergan:2013:ICS, author = "Tom Bergan and Luis Ceze and Dan Grossman", title = "Input-covering schedules for multithreaded programs", journal = j-SIGPLAN, volume = "48", number = "10", pages = "677--692", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509508", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "We propose constraining multithreaded execution to small sets of input-covering schedules, which we define as follows: given a program $P$, we say that a set of schedules $ \Sigma $ covers all inputs of program $P$ if, when given any input, $P$'s execution can be constrained to some schedule in $ \Sigma $ and 
still produce a semantically valid result. Our approach is to first compute a small $ \Sigma $ for a given program $P$, and then, at runtime, constrain $P$'s execution to always follow some schedule in $ \Sigma $, and never deviate. We have designed an algorithm that uses symbolic execution to systematically enumerate a set of input-covering schedules, $ \Sigma $. To deal with programs that run for an unbounded length of time, we partition execution into bounded epochs, find input-covering schedules for each epoch in isolation, and then piece the schedules together at runtime. We have implemented this algorithm along with a constrained execution runtime for pthreads programs, and we report results. Our approach has the following advantage: because all possible runtime schedules are known a priori, we can seek to validate the program by thoroughly verifying each schedule in $ \Sigma $, in isolation, without needing to reason about the huge space of thread interleavings that arises due to conventional nondeterministic execution.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Bois:2013:BGV, author = "Kristof {Du Bois} and Jennifer B.
Sartor and Stijn Eyerman and Lieven Eeckhout", title = "Bottle graphs: visualizing scalability bottlenecks in multi-threaded applications", journal = j-SIGPLAN, volume = "48", number = "10", pages = "355--372", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509529", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "Understanding and analyzing multi-threaded program performance and scalability is far from trivial, which severely complicates parallel software development and optimization. In this paper, we present bottle graphs, a powerful analysis tool that visualizes multi-threaded program performance, in regards to both per-thread parallelism and execution time. Each thread is represented as a box, with its height equal to the share of that thread in the total program execution time, its width equal to its parallelism, and its area equal to its total running time. The boxes of all threads are stacked upon each other, leading to a stack with height equal to the total program execution time. Bottle graphs show exactly how scalable each thread is, and thus guide optimization towards those threads that have a smaller parallel component (narrower), and a larger share of the total execution time (taller), i.e. to the 'neck' of the bottle. Using light-weight OS modules, we calculate bottle graphs for unmodified multi-threaded programs running on real processors with an average overhead of 0.68\%. To demonstrate their utility, we do an extensive analysis of 12 Java benchmarks running on top of the Jikes JVM, which introduces many JVM service threads. 
We not only reveal and explain scalability limitations of several well-known Java benchmarks; we also analyze the reasons why the garbage collector itself does not scale, and in fact performs optimally with two collector threads for all benchmarks, regardless of the number of application threads. Finally, we compare the scalability of Jikes versus the OpenJDK JVM. We demonstrate how useful and intuitive bottle graphs are as a tool to analyze scalability and help optimize multi-threaded applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Bond:2013:GDG, author = "Michael Bond", title = "{GPUDet}: a deterministic {GPU} architecture", journal = j-SIGPLAN, volume = "48", number = "4", pages = "1--12", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451118", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Nondeterminism is a key challenge in developing multithreaded applications. Even with the same input, each execution of a multithreaded program may produce a different output. This behavior complicates debugging and limits one's ability to test for correctness. This non-reproducibility situation is aggravated on massively parallel architectures like graphics processing units (GPUs) with thousands of concurrent threads. We believe providing a deterministic environment to ease debugging and testing of GPU applications is essential to enable a broader class of software to use GPUs. Many hardware and software techniques have been proposed for providing determinism on general-purpose multi-core processors. However, these techniques are designed for small numbers of threads. 
Scaling them to thousands of threads on a GPU is a major challenge. This paper proposes a scalable hardware mechanism, GPUDet, to provide determinism in GPU architectures. In this paper we characterize the existing deterministic and nondeterministic aspects of current GPU execution models, and we use these observations to inform GPUDet's design. For example, GPUDet leverages the inherent determinism of the SIMD hardware in GPUs to provide determinism within a wavefront at no cost. GPUDet also exploits the Z-Buffer Unit, an existing GPU hardware unit for graphics rendering, to allow parallel out-of-order memory writes to produce a deterministic output. Other optimizations in GPUDet include deterministic parallel execution of atomic operations and a workgroup-aware algorithm that eliminates unnecessary global synchronizations. Our simulation results indicate that GPUDet incurs only 2X slowdown on average over a baseline nondeterministic architecture, with runtime overheads as low as 4\% for compute-bound applications, despite running GPU kernels with thousands of threads. We also characterize the sources of overhead for deterministic execution on GPUs to provide insights for further optimizations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "ASPLOS '13 conference proceedings.", } @Article{Bond:2013:OCC, author = "Michael D. 
Bond and Milind Kulkarni and Man Cao and Minjia Zhang and Meisam Fathi Salmi and Swarnendu Biswas and Aritra Sengupta and Jipeng Huang", title = "{OCTET}: capturing and controlling cross-thread dependences efficiently", journal = j-SIGPLAN, volume = "48", number = "10", pages = "693--712", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509519", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", note = "OOPSLA '13 conference proceedings.", abstract = "Parallel programming is essential for reaping the benefits of parallel hardware, but it is notoriously difficult to develop and debug reliable, scalable software systems. One key challenge is that modern languages and systems provide poor support for ensuring concurrency correctness properties --- atomicity, sequential consistency, and multithreaded determinism --- because all existing approaches are impractical. Dynamic, software-based approaches slow programs by up to an order of magnitude because capturing and controlling cross-thread dependences (i.e., conflicting accesses to shared memory) requires synchronization at virtually every access to potentially shared memory. This paper introduces a new software-based concurrency control mechanism called OCTET that soundly captures cross-thread dependences and can be used to build dynamic analyses for concurrency correctness. OCTET achieves low overheads by tracking the locality state of each potentially shared object. Non-conflicting accesses conform to the locality state and require no synchronization; only conflicting accesses require a state change and heavyweight synchronization. 
This optimistic tradeoff leads to significant efficiency gains in capturing cross-thread dependences: a prototype implementation of OCTET in a high-performance Java virtual machine slows real-world concurrent programs by only 26\% on average. A dependence recorder, suitable for record {\&} replay, built on top of OCTET adds an additional 5\% overhead on average. These results suggest that OCTET can provide a foundation for developing low-overhead analyses that check and enforce concurrency correctness.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Bouajjani:2013:ARP, author = "Ahmed Bouajjani and Michael Emmi", title = "Analysis of Recursively Parallel Programs", journal = j-TOPLAS, volume = "35", number = "3", pages = "10:1--10:??", month = nov, year = "2013", CODEN = "ATPSDT", DOI = "https://doi.org/10.1145/2518188", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Fri Nov 8 17:09:04 MST 2013", bibsource = "http://www.acm.org/pubs/contents/journals/toplas/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/toplas.bib", abstract = "We propose a general formal model of isolated hierarchical parallel computations, and identify several fragments to match the concurrency constructs present in real-world programming languages such as Cilk and X10. By associating fundamental formal models (vector addition systems with recursive transitions) to each fragment, we provide a common platform for exposing the relative difficulties of algorithmic reasoning. For each case we measure the complexity of deciding state reachability for finite-data recursive programs, and propose algorithms for the decidable cases. 
The complexities which include PTIME, NP, EXPSPACE, and 2EXPTIME contrast with undecidable state reachability for recursive multithreaded programs.", acknowledgement = ack-nhfb, articleno = "10", fjournal = "ACM Transactions on Programming Languages and Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783", } @Article{Buttari:2013:FGM, author = "Alfredo Buttari", title = "Fine-Grained Multithreading for the Multifrontal {$ Q R $} Factorization of Sparse Matrices", journal = j-SIAM-J-SCI-COMP, volume = "35", number = "4", pages = "C323--C345", month = "????", year = "2013", CODEN = "SJOCE3", DOI = "https://doi.org/10.1137/110846427", ISSN = "1064-8275 (print), 1095-7197 (electronic)", ISSN-L = "1064-8275", bibdate = "Fri Jul 19 07:44:01 MDT 2013", bibsource = "http://epubs.siam.org/sam-bin/dbq/toc/SISC/35/4; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/siamjscicomput.bib", acknowledgement = ack-nhfb, fjournal = "SIAM Journal on Scientific Computing", journal-URL = "http://epubs.siam.org/sisc", onlinedate = "January 2013", } @Article{Cabodi:2013:TBM, author = "Gianpiero Cabodi and Sergio Nocco and Stefano Quer", title = "Thread-based multi-engine model checking for multicore platforms", journal = j-TODAES, volume = "18", number = "3", pages = "36:1--36:??", month = jul, year = "2013", CODEN = "ATASFO", DOI = "https://doi.org/10.1145/2491477.2491480", ISSN = "1084-4309 (print), 1557-7309 (electronic)", ISSN-L = "1084-4309", bibdate = "Sat Jul 27 08:09:07 MDT 2013", bibsource = "http://www.acm.org/pubs/contents/journals/todaes/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/todaes.bib", abstract = "This article describes a multithreaded, portfolio-based approach to model checking, where multiple cores are exploited as the underlying computing framework to support concurrent execution of cooperative engines. 
We introduce a portfolio-based approach to model checking. Our portfolio is first driven by an approximate runtime predictor that provides a heuristic approximation to a perfect oracle and suggests which engines are more suitable for each verification instance. Scalability and robustness of the overall model-checking effort highly rely on a concurrent, multithreaded model of execution. Following similar approaches in related application fields, we dovetail data partitioning, focused on proving several properties in parallel, and engine partitioning, based on concurrent runs of different model-checking engines competing for completion of the same problem. We investigate concurrency not only to effectively exploit several available engines, which operate independently, but also to show that a cooperative effort is possible. In this case, we adopt a straightforward, light-weight, model of inter-engine communication and data sharing. We provide a detailed description of the ideas, algorithms, and experimental results obtained on the benchmarks from the Hardware Model Checking Competition suites (HWMCC'10 and HWMCC'11).", acknowledgement = ack-nhfb, articleno = "36", fjournal = "ACM Transactions on Design Automation of Electronic Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J776", } @Article{Cai:2013:TST, author = "Yan Cai and Ke Zhai and Shangru Wu and W. K. 
Chan", title = "{TeamWork}: synchronizing threads globally to detect real deadlocks for multithreaded programs", journal = j-SIGPLAN, volume = "48", number = "8", pages = "311--312", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442560", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 conference proceedings.", abstract = "This paper presents the aim of TeamWork, our ongoing effort to develop a comprehensive dynamic deadlock confirmation tool for multithreaded programs. It also presents a refined object abstraction algorithm that refines the existing stack hash abstraction.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Cain:2013:RAS, author = "Harold W. Cain and Maged M. Michael and Brad Frey and Cathy May and Derek Williams and Hung Le", title = "Robust architectural support for transactional memory in the {Power} architecture", journal = j-COMP-ARCH-NEWS, volume = "41", number = "3", pages = "225--236", month = jun, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2508148.2485942", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Sat Jul 27 06:58:55 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '13 conference proceedings.", abstract = "On the twentieth anniversary of the original publication [10], following ten years of intense activity in the research literature, hardware support for transactional memory (TM) has finally become a commercial reality, with HTM-enabled chips currently or soon-to-be available from many hardware vendors.
In this paper we describe architectural support for TM added to a future version of the Power ISA{\TM}. Two imperatives drove the development: the desire to complement our weakly-consistent memory model with a more friendly interface to simplify the development and porting of multithreaded applications, and the need for robustness beyond that of some early implementations. In the process of commercializing the feature, we had to resolve some previously unexplored interactions between TM and existing features of the ISA, for example translation shootdown, interrupt handling, atomic read-modify-write primitives, and our weakly consistent memory model. We describe these interactions, the overall architecture, and discuss the motivation and rationale for our choices of architectural semantics, beyond what is typically found in reference manuals.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Chung:2013:LBD, author = "Eric S. Chung and John D. Davis and Jaewon Lee", title = "{LINQits}: big data on little clients", journal = j-COMP-ARCH-NEWS, volume = "41", number = "3", pages = "261--272", month = jun, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2508148.2485945", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Sat Jul 27 06:58:55 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '13 conference proceedings.", abstract = "We present LINQits, a flexible hardware template that can be mapped onto programmable logic or ASICs in a heterogeneous system-on-chip for a mobile device or server. Unlike fixed-function accelerators, LINQits accelerates a domain-specific query language called LINQ.
LINQits does not provide coverage for all possible applications --- however, existing applications (re-)written with LINQ in mind benefit extensively from hardware acceleration. Furthermore, the LINQits framework offers a graceful and transparent migration path from software to hardware. LINQits is prototyped on a 2W heterogeneous SoC called the ZYNQ processor, which combines dual ARM A9 processors with an FPGA on a single die in 28nm silicon technology. Our physical measurements show that LINQits improves energy efficiency by 8.9 to 30.6 times and performance by 10.7 to 38.1 times compared to optimized, multithreaded C programs running on conventional ARM A9 processors.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Demange:2013:PBB, author = "Delphine Demange and Vincent Laporte and Lei Zhao and Suresh Jagannathan and David Pichardie and Jan Vitek", title = "{Plan B}: a buffered memory model for {Java}", journal = j-SIGPLAN, volume = "48", number = "1", pages = "329--342", month = jan, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2480359.2429110", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:03 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Recent advances in verification have made it possible to envision trusted implementations of real-world languages. Java with its type-safety and fully specified semantics would appear to be an ideal candidate; yet, the complexity of the translation steps used in production virtual machines have made it a challenging target for verifying compiler technology. One of Java's key innovations, its memory model, poses significant obstacles to such an endeavor. 
The Java Memory Model is an ambitious attempt at specifying the behavior of multithreaded programs in a portable, hardware agnostic, way. While experts have an intuitive grasp of the properties that the model should enjoy, the specification is complex and not well-suited for integration within a verifying compiler infrastructure. Moreover, the specification is given in an axiomatic style that is distant from the intuitive reordering-based reasonings traditionally used to justify or rule out behaviors, and ill suited to the kind of operational reasoning one would expect to employ in a compiler. This paper takes a step back, and introduces a Buffered Memory Model (BMM) for Java. We choose a pragmatic point in the design space sacrificing generality in favor of a model that is fully characterized in terms of the reorderings it allows, amenable to formal reasoning, and which can be efficiently applied to a specific hardware family, namely x86 multiprocessors. Although the BMM restricts the reorderings compilers are allowed to perform, it serves as the key enabling device to achieving a verification pathway from bytecode to machine instructions. Despite its restrictions, we show that it is backwards compatible with the Java Memory Model and that it does not cripple performance on TSO architectures.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "POPL '13 conference proceedings.", } @Article{DuBois:2013:CSI, author = "Kristof {Du Bois} and Stijn Eyerman and Jennifer B. 
Sartor and Lieven Eeckhout", title = "Criticality stacks: identifying critical threads in parallel programs using synchronization behavior", journal = j-COMP-ARCH-NEWS, volume = "41", number = "3", pages = "511--522", month = jun, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2508148.2485966", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Sat Jul 27 06:58:55 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '13 conference proceedings.", abstract = "Analyzing multi-threaded programs is quite challenging, but is necessary to obtain good multicore performance while saving energy. Due to synchronization, certain threads make others wait, because they hold a lock or have yet to reach a barrier. We call these critical threads, i.e., threads whose performance is determinative of program performance as a whole. Identifying these threads can reveal numerous optimization opportunities, for the software developer and for hardware. In this paper, we propose a new metric for assessing thread criticality, which combines both how much time a thread is performing useful work and how many co-running threads are waiting. We show how thread criticality can be calculated online with modest hardware additions and with low overhead. We use our metric to create criticality stacks that break total execution time into each thread's criticality component, allowing for easy visual analysis of parallel imbalance. To validate our criticality metric, and demonstrate it is better than previous metrics, we scale the frequency of the most critical thread and show it achieves the largest performance improvement.
We then demonstrate the broad applicability of criticality stacks by using them to perform three types of optimizations: (1) program analysis to remove parallel bottlenecks, (2) dynamically identifying the most critical thread and accelerating it using frequency scaling to improve performance, and (3) showing that accelerating only the most critical thread allows for targeted energy reduction.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Ediger:2013:GMA, author = "David Ediger and Karl Jiang and E. Jason Riedy and David A. Bader", title = "{GraphCT}: Multithreaded Algorithms for Massive Graph Analysis", journal = j-IEEE-TRANS-PAR-DIST-SYS, volume = "24", number = "11", pages = "2220--2229", month = nov, year = "2013", CODEN = "ITDSEO", DOI = "https://doi.org/10.1109/TPDS.2012.323", ISSN = "1045-9219 (print), 1558-2183 (electronic)", ISSN-L = "1045-9219", bibdate = "Fri Nov 15 10:31:20 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Parallel and Distributed Systems", journal-URL = "http://www.computer.org/tpds/archives.htm", } @Article{Ferrara:2013:GSA, author = "P. 
Ferrara", title = "A generic static analyzer for multithreaded {Java} programs", journal = j-SPE, volume = "43", number = "6", pages = "663--684", month = jun, year = "2013", CODEN = "SPEXBL", DOI = "https://doi.org/10.1002/spe.2126", ISSN = "0038-0644 (print), 1097-024X (electronic)", ISSN-L = "0038-0644", bibdate = "Tue Dec 3 10:30:05 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/spe.bib; http://www3.interscience.wiley.com/journalfinder.html", acknowledgement = ack-nhfb, fjournal = "Software --- Practice and Experience", journal-URL = "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1097-024X", onlinedate = "9 May 2012", } @Article{Honarmand:2013:CUA, author = "Nima Honarmand and Nathan Dautenhahn and Josep Torrellas and Samuel T. King and Gilles Pokam and Cristiano Pereira", title = "{Cyrus}: unintrusive application-level record-replay for replay parallelism", journal = j-SIGPLAN, volume = "48", number = "4", pages = "193--206", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451138", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Architectures for deterministic record-replay (R\&R) of multithreaded code are attractive for program debugging, intrusion analysis, and fault-tolerance uses. However, very few of the proposed designs have focused on maximizing replay speed --- a key enabling property of these systems. The few efforts that focus on replay speed require intrusive hardware or software modifications, or target whole-system R\&R rather than the more useful application-level R\&R. 
This paper presents the first hardware-based scheme for unintrusive, application-level R\&R that explicitly targets high replay speed. Our scheme, called Cyrus, requires no modification to commodity snoopy cache coherence. It introduces the concept of an on-the-fly software Backend Pass during recording which, as the log is being generated, transforms it for high replay parallelism. This pass also fixes-up the log, and can flexibly trade-off replay parallelism for log size. We analyze the performance of Cyrus using full system (OS plus hardware) simulation. Our results show that Cyrus has negligible recording overhead. In addition, for 8-processor runs of SPLASH-2, Cyrus attains an average replay parallelism of 5, and a replay speed that is, on average, only about 50\% lower than the recording speed.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "ASPLOS '13 conference proceedings.", } @Article{Huang:2013:CRL, author = "Jeff Huang and Charles Zhang and Julian Dolby", title = "{CLAP}: recording local executions to reproduce concurrency failures", journal = j-SIGPLAN, volume = "48", number = "6", pages = "141--152", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462167", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present CLAP, a new technique to reproduce concurrency bugs. CLAP has two key steps. First, it logs thread local execution paths at runtime. Second, offline, it computes memory dependencies that accord with the logged execution and are able to reproduce the observed bug. 
The second step works by combining constraints from the thread paths and constraints based on a memory model, and computing an execution with a constraint solver. CLAP has four major advantages. First, logging purely local execution of each thread is substantially cheaper than logging memory interactions, which enables CLAP to be efficient compared to previous approaches. Second, our logging does not require any synchronization and hence with no added memory barriers or fences; this minimizes perturbation and missed bugs due to extra synchronizations foreclosing certain racy behaviors. Third, since it uses no synchronization, we extend CLAP to work on a range of relaxed memory models, such as TSO and PSO, in addition to sequential consistency. Fourth, CLAP can compute a much simpler execution than the original one, that reveals the bug with minimal thread context switches. To mitigate the scalability issues, we also present an approach to parallelize constraint solving, which theoretically scales our technique to programs with arbitrary execution length. Experimental results on a variety of multithreaded benchmarks and real world concurrent applications validate these advantages by showing that our technique is effective in reproducing concurrency bugs even under relaxed memory models; furthermore, it is significantly more efficient than a state-of-the-art technique that records shared memory dependencies, reducing execution time overhead by 45\% and log size by 88\% on average.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "PLDI '13 conference proceedings.", } @Article{Hunt:2013:DTN, author = "Nicholas Hunt and Tom Bergan and Luis Ceze and Steven D. 
Gribble", title = "{DDOS}: taming nondeterminism in distributed systems", journal = j-SIGPLAN, volume = "48", number = "4", pages = "499--508", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451170", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Nondeterminism complicates the development and management of distributed systems, and arises from two main sources: the local behavior of each individual node as well as the behavior of the network connecting them. Taming nondeterminism effectively requires dealing with both sources. This paper proposes DDOS, a system that leverages prior work on deterministic multithreading to offer: (1) space-efficient record/replay of distributed systems; and (2) fully deterministic distributed behavior. Leveraging deterministic behavior at each node makes outgoing messages strictly a function of explicit inputs. This allows us to record the system by logging just message's arrival time, not the contents. Going further, we propose and implement an algorithm that makes all communication between nodes deterministic by scheduling communication onto a global logical timeline. We implement both algorithms in a system called DDOS and evaluate our system with parallel scientific applications, an HTTP/memcached system and a distributed microbenchmark with a high volume of peer-to-peer communication. 
Our results show up to two orders of magnitude reduction in log size of record/replay, and that distributed systems can be made deterministic with an order of magnitude of overhead.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "ASPLOS '13 conference proceedings.", } @Article{Joao:2013:UBA, author = "Jos{\'e} A. Joao and M. Aater Suleman and Onur Mutlu and Yale N. Patt", title = "Utility-based acceleration of multithreaded applications on asymmetric {CMPs}", journal = j-COMP-ARCH-NEWS, volume = "41", number = "3", pages = "154--165", month = jun, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2508148.2485936", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Sat Jul 27 06:58:55 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '13 conference proceedings.", abstract = "Asymmetric Chip Multiprocessors (ACMPs) are becoming a reality. ACMPs can speed up parallel applications if they can identify and accelerate code segments that are critical for performance. Proposals already exist for using coarse-grained thread scheduling and fine-grained bottleneck acceleration. Unfortunately, there have been no proposals offered thus far to decide which code segments to accelerate in cases where both coarse-grained thread scheduling and fine-grained bottleneck acceleration could have value. This paper proposes Utility-Based Acceleration of Multithreaded Applications on Asymmetric CMPs (UBA), a cooperative software/hardware mechanism for identifying and accelerating the most likely critical code segments from a set of multithreaded applications running on an ACMP. 
The key idea is a new Utility of Acceleration metric that quantifies the performance benefit of accelerating a bottleneck or a thread by taking into account both the criticality and the expected speedup. UBA outperforms the best of two state-of-the-art mechanisms by 11\% for single application workloads and by 7\% for two-application workloads on an ACMP with 52 small cores and 3 large cores.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Kambadur:2013:PSP, author = "Melanie Kambadur and Kui Tang and Joshua Lopez and Martha A. Kim", title = "Parallel scaling properties from a basic block view", journal = j-SIGMETRICS, volume = "41", number = "1", pages = "365--366", month = jun, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2494232.2465748", ISSN = "0163-5999 (print), 1557-9484 (electronic)", ISSN-L = "0163-5999", bibdate = "Fri Feb 28 06:09:59 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigmetrics.bib", abstract = "As software scalability lags behind hardware parallelism, understanding scaling behavior is more important than ever. This paper demonstrates how to use Parallel Block Vector (PBV) profiles to measure the scaling properties of multithreaded programs from a new perspective: the basic block's view. Through this lens, we guide users through quick and simple methods to produce high-resolution application scaling analyses. This method requires no manual program modification, new hardware, or lengthy simulations, and captures the impact of architecture, operating systems, threading models, and inputs. 
We apply these techniques to a set of parallel benchmarks, and, as an example, demonstrate that when it comes to scaling, functions in an application do not behave monolithically.", acknowledgement = ack-nhfb, fjournal = "ACM SIGMETRICS Performance Evaluation Review", journal-URL = "http://portal.acm.org/toc.cfm?id=J618", } @Article{Kim:2013:DBC, author = "Hwanju Kim and Sangwook Kim and Jinkyu Jeong and Joonwon Lee and Seungryoul Maeng", title = "Demand-based coordinated scheduling for {SMP VMs}", journal = j-SIGPLAN, volume = "48", number = "4", pages = "369--380", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451156", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "As processor architectures have been enhancing their computing capacity by increasing core counts, independent workloads can be consolidated on a single node for the sake of high resource efficiency in data centers. With the prevalence of virtualization technology, each individual workload can be hosted on a virtual machine for strong isolation between co-located workloads. Along with this trend, hosted applications have increasingly been multithreaded to take advantage of improved hardware parallelism. Although the performance of many multithreaded applications highly depends on communication (or synchronization) latency, existing schemes of virtual machine scheduling do not explicitly coordinate virtual CPUs based on their communication behaviors. This paper presents a demand-based coordinated scheduling scheme for consolidated virtual machines that host multithreaded workloads. To this end, we propose communication-driven scheduling that controls time-sharing in response to inter-processor interrupts (IPIs) between virtual CPUs. 
On the basis of in-depth analysis on the relationship between IPI communications and coordination demands, we devise IPI-driven coscheduling and delayed preemption schemes, which effectively reduce synchronization latency and unnecessary CPU consumption. In addition, we introduce a load-conscious CPU allocation policy in order to address load imbalance in heterogeneously consolidated environments. The proposed schemes are evaluated with respect to various scenarios of mixed workloads using the PARSEC multithreaded applications. In the evaluation, our scheme improves the overall performance of consolidated workloads, especially communication-intensive applications, by reducing inefficient synchronization latency.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "ASPLOS '13 conference proceedings.", } @Article{LaFratta:2013:EEM, author = "Patrick A. {La Fratta} and Peter M. Kogge", title = "Energy-efficient multithreading for a hierarchical heterogeneous multicore through locality-cognizant thread generation", journal = j-J-PAR-DIST-COMP, volume = "73", number = "12", pages = "1551--1562", month = dec, year = "2013", CODEN = "JPDCER", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Fri Nov 29 09:55:28 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sciencedirect.com/science/journal/07437315", URL = "http://www.sciencedirect.com/science/article/pii/S0743731513001494", acknowledgement = ack-nhfb, fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", } @Article{Lobeiras:2013:PSW, author = "Jacobo Lobeiras and Mois{\'e}s Vi{\~n}as and Margarita Amor and Basilio B. Fraguela and Manuel Arenaz and J. A. Garc{\'\i}a and M. J. 
Castro", title = "Parallelization of shallow water simulations on current multi-threaded systems", journal = j-IJHPCA, volume = "27", number = "4", pages = "493--512", month = nov, year = "2013", CODEN = "IHPCFL", DOI = "https://doi.org/10.1177/1094342012464800", ISSN = "1094-3420 (print), 1741-2846 (electronic)", ISSN-L = "1094-3420", bibdate = "Fri Mar 14 15:39:57 MDT 2014", bibsource = "http://hpc.sagepub.com/content/27/4.toc; https://www.math.utah.edu/pub/tex/bib/ijsa.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://hpc.sagepub.com/content/27/4/493.full.pdf+html", acknowledgement = ack-nhfb, fjournal = "International Journal of High Performance Computing Applications", journal-URL = "http://hpc.sagepub.com/content/by/year", onlinedate = "December 5, 2012", } @Article{Lu:2013:REM, author = "Kai Lu and Xu Zhou and Xiaoping Wang and Wenzhe Zhang and Gen Li", title = "{RaceFree}: an efficient multi-threading model for determinism", journal = j-SIGPLAN, volume = "48", number = "8", pages = "297--298", month = aug, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2517327.2442553", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Aug 26 13:48:51 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "PPoPP '13 Conference proceedings.", abstract = "Current deterministic systems generally incur large overhead due to the difficulty of detecting and eliminating data races. This paper presents RaceFree, a novel multi-threading runtime that adopts a relaxed deterministic model to provide a data-race-free environment for parallel programs. This model cuts off unnecessary shared-memory communication by isolating threads in separated memories, which eliminates direct data races. 
Meanwhile, we leverage the happen-before relation defined by applications themselves as one-way communication pipes to perform necessary thread communication. Shared-memory communication is transparently converted to message-passing style communication by our Memory Modification Propagation (MMP) mechanism, which propagates local memory modifications to other threads through the happen-before relation pipes. The overhead of RaceFree is 67.2\% according to our tests on parallel benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Lucia:2013:CEF, author = "Brandon Lucia and Luis Ceze", title = "Cooperative empirical failure avoidance for multithreaded programs", journal = j-SIGPLAN, volume = "48", number = "4", pages = "39--50", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451121", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Concurrency errors in multithreaded programs are difficult to find and fix. We propose Aviso, a system for avoiding schedule-dependent failures. Aviso monitors events during a program's execution and, when a failure occurs, records a history of events from the failing execution. It uses this history to generate schedule constraints that perturb the order of events in the execution and thereby avoids schedules that lead to failures in future program executions. Aviso leverages scenarios where many instances of the same software run, using a statistical model of program behavior and experimentation to determine which constraints most effectively avoid failures. 
After implementing Aviso, we showed that it decreased failure rates for a variety of important desktop, server, and cloud applications by orders of magnitude, with an average overhead of less than 20\% and, in some cases, as low as 5\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "ASPLOS '13 conference proceedings.", } @Article{Mahafzah:2013:PAM, author = "Basel A. Mahafzah", title = "Performance assessment of multithreaded quicksort algorithm on simultaneous multithreaded architecture", journal = j-J-SUPERCOMPUTING, volume = "66", number = "1", pages = "339--363", month = oct, year = "2013", CODEN = "JOSUED", DOI = "https://doi.org/10.1007/s11227-013-0910-2", ISSN = "0920-8542 (print), 1573-0484 (electronic)", ISSN-L = "0920-8542", bibdate = "Sat Feb 8 10:21:52 MST 2014", bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=66&issue=1; https://www.math.utah.edu/pub/tex/bib/jsuper.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer.com/article/10.1007/s11227-013-0910-2", acknowledgement = ack-nhfb, fjournal = "The Journal of Supercomputing", journal-URL = "http://link.springer.com/journal/11227", } @Article{McCreesh:2013:MTS, author = "Ciaran McCreesh and Patrick Prosser", title = "Multi-Threading a State-of-the-Art Maximum Clique Algorithm", journal = j-ALGORITHMS-BASEL, volume = "6", number = "4", pages = "618--635", month = dec, year = "2013", CODEN = "ALGOCH", DOI = "https://doi.org/10.3390/a6040618", ISSN = "1999-4893 (electronic)", ISSN-L = "1999-4893", bibdate = "Fri May 3 13:50:13 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/algorithms.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "https://www.mdpi.com/1999-4893/6/4/618", acknowledgement = ack-nhfb, fjournal = "Algorithms (Basel)", journal-URL = "https://www.mdpi.com/journal/algorithms", pubdates = 
"Received: 15 August 2013 / Revised: 13 September 2013 / Accepted: 18 September 2013 / Published: 3 October 2013", } @Article{Norris:2013:CCC, author = "Brian Norris and Brian Demsky", title = "{CDSChecker}: checking concurrent data structures written with {C\slash C++} atomics", journal = j-SIGPLAN, volume = "48", number = "10", pages = "131--150", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509514", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "Writing low-level concurrent software has traditionally required intimate knowledge of the entire toolchain and often has involved coding in assembly. New language standards have extended C and C++ with support for low-level atomic operations and a weak memory model, enabling developers to write portable and efficient multithreaded code. Developing correct low-level concurrent code is well-known to be especially difficult under a weak memory model, where code behavior can be surprising. Building reliable concurrent software using C/C++ low-level atomic operations will likely require tools that help developers discover unexpected program behaviors. In this paper we present CDSChecker, a tool for exhaustively exploring the behaviors of concurrent code under the C/C++ memory model. We develop several novel techniques for modeling the relaxed behaviors allowed by the memory model and for minimizing the number of execution behaviors that CDSChecker must explore. 
We have used CDSChecker to exhaustively unit test several concurrent data structure implementations on specific inputs and have discovered errors in both a recently published C11 implementation of a work-stealing queue and a single producer, single consumer queue implementation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Ossner:2013:GMB, author = "Christopher O{\ss}ner and Klemens B{\"o}hm", title = "Graphs for Mining-Based Defect Localization in Multithreaded Programs", journal = j-INT-J-PARALLEL-PROG, volume = "41", number = "4", pages = "570--593", month = aug, year = "2013", CODEN = "IJPPE5", DOI = "https://doi.org/10.1007/s10766-012-0237-2", ISSN = "0885-7458 (print), 1573-7640 (electronic)", ISSN-L = "0885-7458", bibdate = "Sat Jun 22 12:29:22 MDT 2013", bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=41&issue=4; https://www.math.utah.edu/pub/tex/bib/intjparallelprogram.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer.com/article/10.1007/s10766-012-0237-2", acknowledgement = ack-nhfb, fjournal = "International Journal of Parallel Programming", journal-URL = "http://link.springer.com/journal/10766", } @Article{Parashar:2013:TIC, author = "Angshuman Parashar and Michael Pellauer and Michael Adler and Bushra Ahsan and Neal Crago and Daniel Lustig and Vladimir Pavlov and Antonia Zhai and Mohit Gambhir and Aamer Jaleel and Randy Allmon and Rachid Rayess and Stephen Maresh and Joel Emer", title = "Triggered instructions: a control paradigm for spatially-programmed architectures", journal = j-COMP-ARCH-NEWS, volume = "41", number = "3", pages = "142--153", month = jun, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2508148.2485935", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Sat Jul 27 06:58:55 MDT 2013", bibsource = 
"https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '13 conference proceedings.", abstract = "In this paper, we present triggered instructions, a novel control paradigm for arrays of processing elements (PEs) aimed at exploiting spatial parallelism. Triggered instructions completely eliminate the program counter and allow programs to transition concisely between states without explicit branch instructions. They also allow efficient reactivity to inter-PE communication traffic. The approach provides a unified mechanism to avoid over-serialized execution, essentially achieving the effect of techniques such as dynamic instruction reordering and multithreading, which each require distinct hardware mechanisms in a traditional sequential architecture. Our analysis shows that a triggered-instruction based spatial accelerator can achieve 8X greater area-normalized performance than a traditional general-purpose processor. Further analysis shows that triggered control reduces the number of static and dynamic instructions in the critical paths by 62\% and 64\% respectively over a program-counter style spatial baseline, resulting in a speedup of 2.0X.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Pokam:2013:QPI, author = "Gilles Pokam and Klaus Danne and Cristiano Pereira and Rolf Kassa and Tim Kranich and Shiliang Hu and Justin Gottschlich and Nima Honarmand and Nathan Dautenhahn and Samuel T. 
King and Josep Torrellas", title = "{QuickRec}: prototyping an {Intel} architecture extension for record and replay of multithreaded programs", journal = j-COMP-ARCH-NEWS, volume = "41", number = "3", pages = "643--654", month = jun, year = "2013", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2508148.2485977", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Sat Jul 27 06:58:55 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", note = "ISCA '13 conference proceedings.", abstract = "There has been significant interest in hardware-assisted deterministic Record and Replay (RnR) systems for multithreaded programs on multiprocessors. However, no proposal has implemented this technique in a hardware prototype with full operating system support. Such an implementation is needed to assess RnR practicality. This paper presents QuickRec, the first multicore Intel Architecture (IA) prototype of RnR for multithreaded programs. QuickRec is based on QuickIA, an Intel emulation platform for rapid prototyping of new IA extensions. QuickRec is composed of a Xeon server platform with FPGA-emulated second-generation Pentium cores, and Capo3, a full software stack for managing the recording hardware from within a modified Linux kernel. This paper's focus is understanding and evaluating the implementation issues of RnR on a real platform. Our effort leads to some lessons learned, as well as to some pointers for future research. We demonstrate that RnR can be implemented efficiently on a real multicore IA system. In particular, we show that the rate of memory log generation is insignificant, and that the recording hardware has negligible performance overhead. 
However, the software stack incurs an average recording overhead of nearly 13\%, which must be reduced to enable always-on use of RnR.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Raychev:2013:ERD, author = "Veselin Raychev and Martin Vechev and Manu Sridharan", title = "Effective race detection for event-driven programs", journal = j-SIGPLAN, volume = "48", number = "10", pages = "151--166", month = oct, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2544173.2509538", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Dec 9 09:19:33 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", note = "OOPSLA '13 conference proceedings.", abstract = "Like shared-memory multi-threaded programs, event-driven programs such as client-side web applications are susceptible to data races that are hard to reproduce and debug. Race detection for such programs is hampered by their pervasive use of ad hoc synchronization, which can lead to a prohibitive number of false positives. Race detection also faces a scalability challenge, as a large number of short-running event handlers can quickly overwhelm standard vector-clock-based techniques. This paper presents several novel contributions that address both of these challenges. First, we introduce race coverage, a systematic method for exposing ad hoc synchronization and other (potentially harmful) races to the user, significantly reducing false positives. Second, we present an efficient connectivity algorithm for computing race coverage. The algorithm is based on chain decomposition and leverages the structure of event-driven programs to dramatically decrease the overhead of vector clocks. 
We implemented our techniques in a tool called EventRacer and evaluated it on a number of public web sites. The results indicate substantial performance and precision improvements of our approach over the state-of-the-art. Using EventRacer, we found many harmful races, most of which are beyond the reach of current techniques.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", } @Article{Saez:2013:DFP, author = "Juan Carlos S{\'a}ez and Fernando Castro and Daniel Chaver and Manuel Prieto", title = "Delivering fairness and priority enforcement on asymmetric multicore systems via {OS} scheduling", journal = j-SIGMETRICS, volume = "41", number = "1", pages = "343--344", month = jun, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2494232.2465532", ISSN = "0163-5999 (print), 1557-9484 (electronic)", ISSN-L = "0163-5999", bibdate = "Fri Feb 28 06:09:59 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigmetrics.bib", abstract = "Symmetric-ISA (instruction set architecture) asymmetric-performance multicore processors (AMPs) were shown to deliver higher performance per watt and area than symmetric CMPs for applications with diverse architectural requirements. So, it is likely that future multicore processors will combine big power-hungry fast cores and small low-power slow ones. In this paper, we propose a novel thread scheduling algorithm that aims to improve the throughput-fairness trade-off on AMP systems. 
Our experimental evaluation on real hardware and using scheduler implementations on a general-purpose operating system, reveals that our proposal delivers a better throughput-fairness trade-off than previous schedulers for a wide variety of multi-application workloads including single-threaded and multithreaded applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGMETRICS Performance Evaluation Review", journal-URL = "http://portal.acm.org/toc.cfm?id=J618", } @Article{Sinenian:2013:MMS, author = "Nareg Sinenian and Alex B. Zylstra and Mario J.-E. Manuel and Johan A. Frenje and Atma D. Kanojia and Joshua Stillerman and Richard D. Petrasso", title = "A Multithreaded Modular Software Toolkit for Control of Complex Experiments", journal = j-COMPUT-SCI-ENG, volume = "15", number = "1", pages = "66--75", month = jan # "\slash " # feb, year = "2013", CODEN = "CSENFA", DOI = "https://doi.org/10.1109/MCSE.2012.34", ISSN = "1521-9615", ISSN-L = "1521-9615", bibdate = "Fri Jun 21 08:34:49 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/computscieng.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Computing in Science and Engineering", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5992", } @Article{So:2013:STI, author = "Won So and Alexander G. Dean", title = "Software thread integration for instruction-level parallelism", journal = j-TECS, volume = "13", number = "1", pages = "8:1--8:??", month = aug, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2512466", ISSN = "1539-9087 (print), 1558-3465 (electronic)", ISSN-L = "1539-9087", bibdate = "Thu Sep 5 19:03:11 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/tecs.bib", abstract = "Multimedia applications require a significantly higher level of performance than previous workloads of embedded systems. 
They have driven digital signal processor (DSP) makers to adopt high-performance architectures like VLIW (Very-Long Instruction Word). Despite many efforts to exploit instruction-level parallelism (ILP) in the application, the speed is a fraction of what it could be, limited by the difficulty of finding enough independent instructions to keep all of the processor's functional units busy. This article proposes Software Thread Integration (STI) for instruction-level parallelism. STI is a software technique for interleaving multiple threads of control into a single implicitly multithreaded one. We use STI to improve the performance on ILP processors by merging parallel procedures into one, increasing the compiler's scope and hence allowing it to create a more efficient instruction schedule. Assuming the parallel procedures are given, we define a methodology for finding the best performing integrated procedure with a minimum compilation time. We quantitatively estimate the performance impact of integration, allowing various integration scenarios to be compared and ranked via profitability analysis. During integration of threads, different ILP-improving code transformations are selectively applied according to the control structure and the ILP characteristics of the code, driven by interactions with software pipelining. The estimated profitability is verified and corrected by an iterative compilation approach, compensating for possible estimation inaccuracy. Our modeling methods combined with limited compilation quickly find the best integration scenario without requiring exhaustive integration.", acknowledgement = ack-nhfb, articleno = "8", fjournal = "ACM Transactions on Embedded Computing Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?&idx=J840", } @Article{Taft:2013:TPS, author = "S. 
Tucker Taft", title = "Tutorial: proving safety of parallel \slash multi-threaded programs", journal = j-SIGADA-LETTERS, volume = "33", number = "3", pages = "1--2", month = dec, year = "2013", CODEN = "AALEE5", DOI = "https://doi.org/10.1145/2658982.2527285", ISSN = "1094-3641 (print), 1557-9476 (electronic)", ISSN-L = "1094-3641", bibdate = "Wed Sep 3 16:38:30 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigada.bib", abstract = "This tutorial will introduce the attendees to analysis and proof techniques for programs using parallelism and multi-threading. There are no specific prerequisites, but a familiarity with the notions of preconditions and postconditions, aliasing, race conditions, and deadlocks would be of value. The examples will be based on the threading and parallelism models of Java, Ada, and two new parallel languages, one called ParaSail [4] and another, inspired by the verifiable SPARK[1][2] subset of Ada, called Sparkel[3]. We will introduce the distinction between safety and liveness properties, and then focus primarily on techniques for the verification of safety properties, including the absence of race conditions and deadlocks. We will also discuss the issue of determinism vs. 
non-determinism in parallel and multi-threaded programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGADA Ada Letters", journal-URL = "http://portal.acm.org/citation.cfm?id=J32", remark = "HILT '13 conference proceedings.", } @Article{Tembey:2013:SSS, author = "Priyanka Tembey and Augusto Vega and Alper Buyuktosunoglu and Dilma Da Silva and Pradip Bose", title = "{SMT} switch: Software Mechanisms for Power Shifting", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "12", number = "2", pages = "67--70", month = jul # "\slash " # dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2012.26", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Jun 20 17:18:18 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Simultaneous multithreading (SMT) as a processor design to achieve higher levels of system and application throughput is a well-accepted and deployed technique in most desktop and server processors. We study the power implications of varying SMT levels i.e., thread counts per core for various multi-threaded applications on a real SMT multicore platform, and introduce a novel software mechanism of changing SMT level of a core to tune platform power. Power-shifting policies by varying per core SMT levels for performance benefits within a power cap are introduced. Projected power savings (of 15\%) for a streaming parallel benchmark can be attained using SMT-level power shifting mechanisms.", acknowledgement = ack-nhfb, affiliation = "Tembey, P (Reprint Author), Georgia Tech, Atlanta, GA 30332 USA. Tembey, Priyanka, Georgia Tech, Atlanta, GA 30332 USA.", da = "2019-06-20", doc-delivery-number = "279CD", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Multicore platforms; Operating Systems; Power shifting; SMT", number-of-cited-references = "11", research-areas = "Computer Science", times-cited = "0", unique-id = "Tembey:2013:SSS", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Wester:2013:PDR, author = "Benjamin Wester and David Devecsery and Peter M. Chen and Jason Flinn and Satish Narayanasamy", title = "Parallelizing data race detection", journal = j-SIGPLAN, volume = "48", number = "4", pages = "27--38", month = apr, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499368.2451120", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:23 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Detecting data races in multithreaded programs is a crucial part of debugging such programs, but traditional data race detectors are too slow to use routinely. This paper shows how to speed up race detection by spreading the work across multiple cores. Our strategy relies on uniparallelism, which executes time intervals of a program (called epochs ) in parallel to provide scalability, but executes all threads from a single epoch on a single core to eliminate locking overhead. We use several techniques to make parallelization effective: dividing race detection into three phases, predicting a subset of the analysis state, eliminating sequential work via transitive reduction, and reducing the work needed to maintain multiple versions of analysis via factorization. We demonstrate our strategy by parallelizing a happens-before detector and a lockset-based detector. We find that uniparallelism can significantly speed up data race detection. 
With 4x the number of cores as the original application, our strategy speeds up the median execution time by 4.4x for a happens-before detector and 3.3x for a lockset race detector. Even on the same number of cores as the conventional detectors, the ability for uniparallelism to elide analysis locks allows it to reduce the median overhead by 13\% for a happens-before detector and 8\% for a lockset detector.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "ASPLOS '13 conference proceedings.", } @Article{Yu:2013:GDS, author = "Hongtao Yu and Hou-Jen Ko and Zhiyuan Li", title = "General data structure expansion for multi-threading", journal = j-SIGPLAN, volume = "48", number = "6", pages = "243--252", month = jun, year = "2013", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2499370.2462182", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jul 1 17:15:38 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Among techniques for parallelizing sequential codes, privatization is a common and significant transformation performed by both compilers and runtime parallelizing systems. Without privatization, repetitive updates to the same data structures often introduce spurious data dependencies that hide the inherent parallelism. Unfortunately, it remains a significant challenge to compilers to automatically privatize dynamic and recursive data structures which appear frequently in real applications written in languages such as C/C++. This is because such languages lack a naming mechanism to define the address range of a pointer-based data structure, in contrast to arrays with explicitly declared bounds. 
In this paper we present a novel solution to this difficult problem by expanding general data structures such that memory accesses issued from different threads to contentious data structures are directed to different data fields. Based on compile-time type checking and a data dependence graph, this aggressive extension to the traditional scalar and array expansion isolates the address ranges among different threads, without struggling with privatization based on thread-private stacks, such that the targeted loop can be effectively parallelized. With this method fully implemented in GCC, experiments are conducted on a set of programs from well-known benchmark suites such as Mibench, MediaBench II and SPECint. Results show that the new approach can lead to a high speedup when executing the transformed code on multiple cores.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "PLDI '13 conference proceedings.", } @Article{Zarrabi:2013:LSF, author = "Amirreza Zarrabi and Khairulmizam Samsudin and Wan Azizun Wan Adnan", title = "{Linux} Support for Fast Transparent General Purpose Checkpoint\slash Restart of Multithreaded Processes in Loadable Kernel Module", journal = j-J-GRID-COMP, volume = "11", number = "2", pages = "187--210", month = jun, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1007/s10723-013-9248-5", ISSN = "1570-7873 (print), 1572-9184 (electronic)", ISSN-L = "1570-7873", bibdate = "Sat Jun 22 11:03:44 MDT 2013", bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=1570-7873&volume=11&issue=2; https://www.math.utah.edu/pub/tex/bib/jgridcomp.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer.com/article/10.1007/s10723-013-9248-5", acknowledgement = ack-nhfb, fjournal = "Journal of Grid Computing", journal-URL = "http://link.springer.com/journal/10723", } @Article{Awile:2014:PWF, author = "Omar Awile 
and Ivo F. Sbalzarini", title = "A {Pthreads} Wrapper for {Fortran 2003}", journal = j-TOMS, volume = "40", number = "3", pages = "19:1--19:15", month = apr, year = "2014", CODEN = "ACMSCU", DOI = "https://doi.org/10.1145/2558889", ISSN = "0098-3500 (print), 1557-7295 (electronic)", ISSN-L = "0098-3500", bibdate = "Mon Apr 21 17:42:14 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/fortran3.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/toms.bib", abstract = "With the advent of multicore processors, numerical and mathematical software relies on parallelism in order to benefit from hardware performance increases. We present the design and use of a Fortran 2003 wrapper for POSIX threads, called forthreads. Forthreads is complete in the sense that it provides native Fortran 2003 interfaces to all pthreads routines where possible. We demonstrate the use and efficiency of forthreads for SIMD parallelism and task parallelism. We present forthreads/MPI implementations that enable hybrid shared-/distributed-memory parallelism in Fortran 2003. Our benchmarks show that forthreads offers performance comparable to that of OpenMP, but better thread control and more freedom. We demonstrate the latter by presenting a multithreaded Fortran 2003 library for POSIX Internet sockets, enabling interactive numerical simulations with runtime control.", acknowledgement = ack-nhfb, articleno = "19", fjournal = "ACM Transactions on Mathematical Software (TOMS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J782", } @Article{Bartolini:2014:AFG, author = "Davide B. Bartolini and Filippo Sironi and Donatella Sciuto and Marco D.
Santambrogio", title = "Automated Fine-Grained {CPU} Provisioning for Virtual Machines", journal = j-TACO, volume = "11", number = "3", pages = "27:1--27:??", month = oct, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2637480", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 27 17:02:20 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Ideally, the pay-as-you-go model of Infrastructure as a Service (IaaS) clouds should enable users to rent just enough resources (e.g., CPU or memory bandwidth) to fulfill their service level objectives (SLOs). Achieving this goal is hard on current IaaS offers, which require users to explicitly specify the amount of resources to reserve; this requirement is nontrivial for users, because estimating the amount of resources needed to attain application-level SLOs is often complex, especially when resources are virtualized and the service provider colocates virtual machines (VMs) on host nodes. For this reason, users who deploy VMs subject to SLOs are usually prone to overprovisioning resources, thus resulting in inflated business costs. This article tackles this issue with AutoPro: a runtime system that enhances IaaS clouds with automated and fine-grained resource provisioning based on performance SLOs. Our main contribution with AutoPro is filling the gap between application-level performance SLOs and allocation of a contended resource, without requiring explicit reservations from users. In this article, we focus on CPU bandwidth allocation to throughput-driven, compute-intensive multithreaded applications colocated on a multicore processor; we show that a theoretically sound, yet simple, control strategy can enable automated fine-grained allocation of this contended resource, without the need for offline profiling. 
Additionally, AutoPro helps service providers optimize infrastructure utilization by provisioning idle resources to best-effort workloads, so as to maximize node-level utilization. Our extensive experimental evaluation confirms that AutoPro is able to automatically determine and enforce allocations to meet performance SLOs while maximizing node-level utilization by supporting batch workloads on a best-effort basis.", acknowledgement = ack-nhfb, articleno = "27", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", } @Article{Bergan:2014:SEM, author = "Tom Bergan and Dan Grossman and Luis Ceze", title = "Symbolic execution of multithreaded programs from arbitrary program contexts", journal = j-SIGPLAN, volume = "49", number = "10", pages = "491--506", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660200", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We describe an algorithm to perform symbolic execution of a multithreaded program starting from an arbitrary program context. We argue that this can enable more efficient symbolic exploration of deep code paths in multithreaded programs by allowing the symbolic engine to jump directly to program contexts of interest. The key challenge is modeling the initial context with reasonable precision --- an overly approximate model leads to exploration of many infeasible paths during symbolic execution, while a very precise model would be so expensive to compute that computing it would defeat the purpose of jumping directly to the initial context in the first place. 
We propose a context-specific dataflow analysis that approximates the initial context cheaply, but precisely enough to avoid some common causes of infeasible-path explosion. This model is necessarily approximate --- it may leave portions of the memory state unconstrained, leaving our symbolic execution unable to answer simple questions such as ``which thread holds lock A?''. For such cases, we describe a novel algorithm for evaluating symbolic synchronization during symbolic execution. Our symbolic execution semantics are sound and complete up to the limits of the underlying SMT solver. We describe initial experiments on an implementation in Cloud 9.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "OOPSLA '14 conference proceedings.", } @Article{Bokhari:2014:MMM, author = "Shahid H. Bokhari and {\"U}mit V. {\c{C}}ataly{\"u}rek and Metin N. Gurcan", title = "Massively multithreaded maxflow for image segmentation on the {Cray XMT-2}", journal = j-CCPE, volume = "26", number = "18", pages = "2836--2855", day = "25", month = dec, year = "2014", CODEN = "CCPEBO", DOI = "https://doi.org/10.1002/cpe.3181", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Wed Feb 11 22:34:11 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/ccpe.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Concurrency and Computation: Practice and Experience", journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626", onlinedate = "5 Dec 2013", } @Book{Butcher:2014:SCM, author = "Paul N. 
Butcher", title = "Seven concurrency models in seven weeks: when threads unravel", publisher = "The Pragmatic Bookshelf", address = "Dallas, TX, USA", pages = "xiii + 275", year = "2014", ISBN = "1-937785-65-3 (paperback), 1-941222-27-7 (e-book)", ISBN-13 = "978-1-937785-65-9 (paperback), 978-1-941222-27-0 (e-book)", LCCN = "QA76.642 .B88 2014", bibdate = "Thu Dec 4 13:32:20 MST 2014", bibsource = "fsz3950.oclc.org:210/WorldCat; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", series = "The Pragmatic Programmers", URL = "http://proquest.safaribooksonline.com/?fpi=9781941222737", acknowledgement = ack-nhfb, subject = "Computer multitasking; Parallel programming (Computer science); Nebenl{\"a}ufigkeit; Parallelverarbeitung", tableofcontents = "Introduction \\ Threads and locks \\ Functional programming \\ The Clojure way: separating identity from state \\ Actors \\ Communicating sequential processes \\ Data parallelism \\ The Lambda Architecture \\ Wrapping up", } @Article{Cai:2014:MSD, author = "Y. Cai and W. K. Chan", title = "{Magiclock}: Scalable Detection of Potential Deadlocks in Large-Scale Multithreaded Programs", journal = j-IEEE-TRANS-SOFTW-ENG, volume = "40", number = "3", pages = "266--281", month = mar, year = "2014", CODEN = "IESEDJ", DOI = "https://doi.org/10.1109/TSE.2014.2301725", ISSN = "0098-5589 (print), 1939-3520 (electronic)", ISSN-L = "0098-5589", bibdate = "Thu Feb 1 19:49:24 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranssoftweng2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=6718069", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Software Engineering", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=32", } @Article{Catano:2014:CSL, author = "N{\'e}stor Cata{\~n}o and Ijaz Ahmed and Radu I. 
Siminiceanu and Jonathan Aldrich", title = "A case study on the lightweight verification of a multi-threaded task server", journal = j-SCI-COMPUT-PROGRAM, volume = "80", number = "??", pages = "169--187", day = "1", month = feb, year = "2014", CODEN = "SCPGD4", ISSN = "0167-6423 (print), 1872-7964 (electronic)", ISSN-L = "0167-6423", bibdate = "Sat Nov 30 15:06:16 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/scicomputprogram.bib; http://www.sciencedirect.com/science/journal/01676423", URL = "http://www.sciencedirect.com/science/article/pii/S0167642313000178", acknowledgement = ack-nhfb, fjournal = "Science of Computer Programming", journal-URL = "http://www.sciencedirect.com/science/journal/01676423", } @Article{Che:2014:ALM, author = "Hao Che and Minh Nguyen", title = "{Amdahl's Law} for multithreaded multicore processors", journal = j-J-PAR-DIST-COMP, volume = "74", number = "10", pages = "3056--3069", month = oct, year = "2014", CODEN = "JPDCER", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Thu Aug 21 16:26:06 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.sciencedirect.com/science/article/pii/S0743731514001142", acknowledgement = ack-nhfb, fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315/", } @Article{David:2014:CMC, author = "Florian David and Gael Thomas and Julia Lawall and Gilles Muller", title = "Continuously measuring critical section pressure with the free-lunch profiler", journal = j-SIGPLAN, volume = "49", number = "10", pages = "291--307", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660210", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 
17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Today, Java is regularly used to implement large multi-threaded server-class applications that use locks to protect access to shared data. However, understanding the impact of locks on the performance of a system is complex, and thus the use of locks can impede the progress of threads on configurations that were not anticipated by the developer, during specific phases of the execution. In this paper, we propose Free Lunch, a new lock profiler for Java application servers, specifically designed to identify, in-vivo, phases where the progress of the threads is impeded by a lock. Free Lunch is designed around a new metric, critical section pressure (CSP), which directly correlates the progress of the threads to each of the locks. Using Free Lunch, we have identified phases of high CSP, which were hidden with other lock profilers, in the distributed Cassandra NoSQL database and in several applications from the DaCapo 9.12, the SPECjvm2008 and the SPECjbb2005 benchmark suites. 
Our evaluation of Free Lunch shows that its overhead is never greater than 6\%, making it suitable for in-vivo use.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "OOPSLA '14 conference proceedings.", } @Article{Esparza:2014:PBV, author = "Javier Esparza and Pierre Ganty and Tom{\'a}s Poch", title = "Pattern-Based Verification for Multithreaded Programs", journal = j-TOPLAS, volume = "36", number = "3", pages = "9:1--9:??", month = sep, year = "2014", CODEN = "ATPSDT", DOI = "https://doi.org/10.1145/2629644", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Tue Oct 28 17:06:29 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/toplas/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/toplas.bib", abstract = "Pattern-based verification checks the correctness of program executions that follow a given pattern, a regular expression over the alphabet of program transitions of the form $ w_1^* \ldots{} w_n^* $. For multithreaded programs, the alphabet of the pattern is given by the reads and writes to the shared storage. We study the complexity of pattern-based verification for multithreaded programs with shared counters and finite variables. While unrestricted verification is undecidable for abstracted multithreaded programs with recursive procedures and PSPACE-complete for abstracted multithreaded while-programs (even without counters), we show that pattern-based verification is NP-complete for both classes, even in the presence of counters. We then conduct a multiparameter analysis to study the complexity of the problem on its three natural parameters (number of threads+counters+variables, maximal size of a thread, size of the pattern) and on two parameters related to thread structure (maximal number of procedures per thread and longest simple path of procedure calls).
We present an algorithm that for a fixed number of threads, counters, variables, and pattern size solves the verification problem in $ {\rm st}^{O ({\rm lsp} + \lceil \log ({\rm pr} + 1) \rceil)} $ time, where $ {\rm st} $ is the maximal size of a thread, $ {\rm pr} $ is the maximal number of procedures per thread, and $ {\rm lsp} $ is the longest simple path of procedure calls.", acknowledgement = ack-nhfb, articleno = "9", fjournal = "ACM Transactions on Programming Languages and Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783", } @Article{Eyerman:2014:RCW, author = "Stijn Eyerman and Lieven Eeckhout", title = "Restating the Case for Weighted-{IPC} Metrics to Evaluate Multiprogram Workload Performance", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "13", number = "2", pages = "93--96", month = jul # "\slash " # dec, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2013.9", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Weighted speedup is nowadays the most commonly used multiprogram workload performance metric. Weighted speedup is a weighted-IPC metric, i.e., the multiprogram IPC of each program is first weighted with its isolated IPC. Recently, Michaud questions the validity of weighted-IPC metrics by arguing that they are inconsistent and that weighted speedup favors unfairness [4]. Instead, he advocates using the arithmetic or harmonic mean of the raw IPC values of the programs in the multiprogram workload. We show that weighted-IPC metrics are not inconsistent, and that weighted speedup is fair in giving equal importance to each program. We argue that, in contrast to raw-IPC metrics, weighted-IPC metrics have a system-level meaning, and that raw-IPC metrics are affected by the inherent behavior of the programs.
We also show that the choice of a metric may adversely affect the conclusions from an experiment. We suggest to use two weighted-IPC metrics-system throughput (STP) and average normalized turnaround time (ANTT)-for evaluating multiprogram workload performance, and to avoid raw-IPC metrics.", acknowledgement = ack-nhfb, affiliation = "Eyerman, S (Reprint Author), Univ Ghent, B-9000 Ghent, Belgium. Eyerman, Stijn; Eeckhout, Lieven, Univ Ghent, B-9000 Ghent, Belgium.", da = "2019-06-20", doc-delivery-number = "AX5PM", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Research Foundation --- Flanders (FWO); European Research Council under the European Community [259295]", funding-text = "Stijn Eyerman is supported through a postdoctoral fellowship by the Research Foundation --- Flanders (FWO). Additional support is provided by the European Research Council under the European Community's Seventh Framework Programme (FP7/2007-2013) / ERC Grant agreement no. 259295.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "ANTT; average normalized turnaround time; Benchmark testing; C Computer Systems Organization; C.1 Processor Architectures; C.1.3 Other Architecture Styles; C.1.3.h Multithreaded processors; C.1.4 Parallel Architectures; C.1.4.e Multi-core/single-chip multiprocessors; C.4 Performance of Systems; C.4.c Measurement techniques; Degradation; Harmonic analysis; harmonic mean; Multicore processing; multiprocessing systems; multiprogram IPC; multiprogram workload performance metric; multiprogramming; raw-IPC metrics; STP; system throughput; system-level meaning; Throughput; Weight measurement; weighted speedup; weighted-IPC metric", number-of-cited-references = "6", research-areas = "Computer Science", times-cited = "9", unique-id = "Eyerman:2014:RCW", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Fabregat-Traver:2014:SSG, author = "Diego Fabregat-Traver and Yurii S. Aulchenko and Paolo Bientinesi", title = "Solving sequences of generalized least-squares problems on multi-threaded architectures", journal = j-APPL-MATH-COMP, volume = "234", number = "??", pages = "606--617", day = "15", month = may, year = "2014", CODEN = "AMHCBQ", ISSN = "0096-3003 (print), 1873-5649 (electronic)", ISSN-L = "0096-3003", bibdate = "Mon Apr 21 18:04:13 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/applmathcomput2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.sciencedirect.com/science/article/pii/S0096300314002951", acknowledgement = ack-nhfb, fjournal = "Applied Mathematics and Computation", journal-URL = "http://www.sciencedirect.com/science/journal/00963003/", } @Article{Frincu:2014:ESV, author = "Marc E. 
Frincu and St{\'e}phane Genaud and Julien Gossa", title = "On the efficiency of several {VM} provisioning strategies for workflows with multi-threaded tasks on clouds", journal = j-COMPUTING, volume = "96", number = "11", pages = "1059--1086", month = nov, year = "2014", CODEN = "CMPTA2", DOI = "https://doi.org/10.1007/s00607-014-0410-0", ISSN = "0010-485X (print), 1436-5057 (electronic)", ISSN-L = "0010-485X", bibdate = "Wed Feb 11 07:42:25 MST 2015", bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0010-485X&volume=96&issue=11; https://www.math.utah.edu/pub/tex/bib/computing.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", URL = "http://link.springer.com/article/10.1007/s00607-014-0410-0", acknowledgement = ack-nhfb, fjournal = "Computing", journal-URL = "http://link.springer.com/journal/607", } @Article{Gerakios:2014:SSG, author = "Prodromos Gerakios and Nikolaos Papaspyrou and Konstantinos Sagonas", title = "Static safety guarantees for a low-level multithreaded language with regions", journal = j-SCI-COMPUT-PROGRAM, volume = "80", number = "??", pages = "223--263", day = "1", month = feb, year = "2014", CODEN = "SCPGD4", ISSN = "0167-6423 (print), 1872-7964 (electronic)", ISSN-L = "0167-6423", bibdate = "Sat Nov 30 15:06:20 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/scicomputprogram.bib; http://www.sciencedirect.com/science/journal/01676423", URL = "http://www.sciencedirect.com/science/article/pii/S0167642313001433", acknowledgement = ack-nhfb, fjournal = "Science of Computer Programming", journal-URL = "http://www.sciencedirect.com/science/journal/01676423", } @Article{Giceva:2014:DQP, author = "Jana Giceva and Gustavo Alonso and Timothy Roscoe and Tim Harris", title = "Deployment of query plans on multicores", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "3", pages = 
"233--244", month = nov, year = "2014", CODEN = "????", ISSN = "2150-8097", ISSN-L = "2150-8097", bibdate = "Mon Feb 9 18:24:34 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Efficient resource scheduling of multithreaded software on multicore hardware is difficult given the many parameters involved and the hardware heterogeneity of existing systems. In this paper we explore the efficient deployment of query plans over a multicore machine. We focus on shared query systems, and implement the proposed ideas using SharedDB. The goal of the paper is to explore how to deliver maximum performance and predictability, while minimizing resource utilization when deploying query plans on multicore machines. We propose to use resource activity vectors to characterize the behavior of individual database operators. We then present a novel deployment algorithm which uses these vectors together with dataflow information from the query plan to optimally assign relational operators to physical cores. Experiments demonstrate that this approach significantly reduces resource requirements while preserving performance and is robust across different server architectures.", acknowledgement = ack-nhfb, fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Gonzalez-Mesa:2014:ETM, author = "M. A. Gonzalez-Mesa and Eladio Gutierrez and Emilio L. 
Zapata and Oscar Plata", title = "Effective Transactional Memory Execution Management for Improved Concurrency", journal = j-TACO, volume = "11", number = "3", pages = "24:1--24:??", month = oct, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2633048", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 27 17:02:20 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "This article describes a transactional memory execution model intended to exploit maximum parallelism from sequential and multithreaded programs. A program code section is partitioned into chunks that will be mapped onto threads and executed transactionally. These transactions run concurrently and out of order, trying to exploit maximum parallelism but managed by a specific fully distributed commit control to meet data dependencies. To accomplish correct parallel execution, a partial precedence order relation is derived from the program code section and/or defined by the programmer. When a conflict between chunks is eagerly detected, the precedence order relation is used to determine the best policy to solve the conflict that preserves the precedence order while maximizing concurrency. The model defines a new transactional state called executed but not committed. This state allows exploiting concurrency on two levels: intrathread and interthread. Intrathread concurrency is improved by having pending uncommitted transactions while executing a new one in the same thread. The new state improves interthread concurrency because it permits out-of-order transaction commits regarding the precedence order. 
Our model has been implemented in a lightweight software transactional memory system, TinySTM, and has been evaluated on a set of benchmarks obtaining an important performance improvement over the baseline TM system.", acknowledgement = ack-nhfb, articleno = "24", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", } @Article{Guzzi:2014:CPP, author = "P. H. Guzzi and G. Agapito and M. Cannataro", title = "{coreSNP}: Parallel Processing of Microarray Data", journal = j-IEEE-TRANS-COMPUT, volume = "63", number = "12", pages = "2961--2974", month = dec, year = "2014", CODEN = "ITCOB4", DOI = "https://doi.org/10.1109/TC.2013.176", ISSN = "0018-9340 (print), 1557-9956 (electronic)", ISSN-L = "0018-9340", bibdate = "Thu Dec 4 10:36:57 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Computers", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12", keywords = "Affymetrix; bioinformatics; Bioinformatics; Bioinformatics (genome or protein) databases; coreSNP parallel software tool; distributed programming; distributed systems; DMET SNP microarray data; DNA; Drug Metabolism Enzymes and Transporters; drug response; drug therapy improvement; drug toxicity; Drugs; drugs; enzymes; experimental data analysis; experimental data preprocessing; experimental data storage; gene expression; genetic variation; genetics; Genomics; genomics; genomics diffusion; graphical user interface; graphical user interfaces; health care; healthcare; high-throughput technologies; information retrieval; lab-on-a-chip; maximum drug efficacy; medical information systems; microarray data; minimal adverse effects; multi-threading; next generation sequencing; parallel processing; Parallel processing; patient genotype; performance evaluation; 
pharmacogenomics analysis pipeline; response times; scalable multithreaded implementation; single-nucleotide polymorphisms; SNP annotation; statistical analysis; Statistical analysis; statistical software; Throughput", } @Article{Hayden:2014:KEG, author = "Christopher M. Hayden and Karla Saur and Edward K. Smith and Michael Hicks and Jeffrey S. Foster", title = "{Kitsune}: Efficient, General-Purpose Dynamic Software Updating for {C}", journal = j-TOPLAS, volume = "36", number = "4", pages = "13:1--13:??", month = oct, year = "2014", CODEN = "ATPSDT", DOI = "https://doi.org/10.1145/2629460", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Tue Oct 28 17:05:40 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/toplas/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/toplas.bib", abstract = "Dynamic software updating (DSU) systems facilitate software updates to running programs, thereby permitting developers to add features and fix bugs without downtime. This article introduces Kitsune, a DSU system for C. Kitsune's design has three notable features. First, Kitsune updates the whole program, rather than individual functions, using a mechanism that places no restrictions on data representations or allowed compiler optimizations. Second, Kitsune makes the important aspects of updating explicit in the program text, making the program's semantics easy to understand while minimizing programmer effort. Finally, the programmer can write simple specifications to direct Kitsune to generate code that traverses and transforms old-version state for use by new code; such state transformation is often necessary and is significantly more difficult in prior DSU systems. 
We have used Kitsune to update six popular, open-source, single- and multithreaded programs and find that few program changes are required to use Kitsune, that it incurs essentially no performance overhead, and that update times are fast.", acknowledgement = ack-nhfb, articleno = "13", fjournal = "ACM Transactions on Programming Languages and Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783", } @Article{Honarmand:2014:RRR, author = "Nima Honarmand and Josep Torrellas", title = "{RelaxReplay}: record and replay for relaxed-consistency multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "223--238", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541979", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Aug 18 17:12:47 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Record and Deterministic Replay (RnR) of multithreaded programs on relaxed-consistency multiprocessors has been a long-standing problem. While there are designs that work for Total Store Ordering (TSO), finding a general solution that is able to record the access reordering allowed by any relaxed-consistency model has proved challenging. This paper presents the first complete solution for hardware-assisted memory race recording that works for any relaxed-consistency model of current processors. With the scheme, called RelaxReplay, we can build an RnR system for any relaxed-consistency model and coherence protocol. RelaxReplay's core innovation is a new way of capturing memory access reordering. Each memory instruction goes through a post-completion in-order counting step that detects any reordering, and efficiently records it. We evaluate RelaxReplay with simulations of an 8-core release-consistent multicore running SPLASH-2 programs.
We observe that RelaxReplay induces negligible overhead during recording. In addition, the average size of the log produced is comparable to the log sizes reported for existing solutions, and still very small compared to the memory bandwidth of modern machines. Finally, deterministic replay is efficient and needs minimal hardware support.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", remark = "ASPLOS '14 conference proceedings.", } @Article{Kaiser:2014:WAM, author = "Alexander Kaiser and Daniel Kroening and Thomas Wahl", title = "A Widening Approach to Multithreaded Program Verification", journal = j-TOPLAS, volume = "36", number = "4", pages = "14:1--14:??", month = oct, year = "2014", CODEN = "ATPSDT", DOI = "https://doi.org/10.1145/2629608", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Tue Oct 28 17:05:40 MDT 2014", bibsource = "http://www.acm.org/pubs/contents/journals/toplas/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/toplas.bib", abstract = "Pthread-style multithreaded programs feature rich thread communication mechanisms, such as shared variables, signals, and broadcasts. In this article, we consider the automated verification of such programs where an unknown number of threads execute a given finite-data procedure in parallel. Such procedures are typically obtained as predicate abstractions of recursion-free source code written in C or Java. Many safety problems over finite-data replicated multithreaded programs are decidable via a reduction to the coverability problem in certain types of well-ordered infinite-state transition systems. On the other hand, in full generality, this problem is Ackermann-hard, which seems to rule out efficient algorithmic treatment. We present a novel, sound, and complete yet empirically efficient solution. 
Our approach is to judiciously widen the original set of coverability targets by configurations that involve fewer threads and are thus easier to decide, and whose exploration may well be sufficient: if they turn out uncoverable, so are the original targets. To soften the impact of ``bad guesses''-configurations that turn out coverable-the exploration is accompanied by a parallel engine that generates coverable configurations; none of these is ever selected for widening. Its job being merely to prevent bad widening choices, such an engine need not be complete for coverability analysis, which enables a range of existing partial (e.g., nonterminating) techniques. We present extensive experiments on multithreaded C programs, including device driver code from FreeBSD, Solaris, and Linux distributions. Our approach outperforms existing coverability methods by orders of magnitude.", acknowledgement = ack-nhfb, articleno = "14", fjournal = "ACM Transactions on Programming Languages and Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783", } @Article{Kim:2014:SMC, author = "S. Kim", title = "Synthesizing Multithreaded Code from Real-Time Object-Oriented Models via Schedulability-Aware Thread Derivation", journal = j-IEEE-TRANS-SOFTW-ENG, volume = "40", number = "4", pages = "413--426", month = apr, year = "2014", CODEN = "IESEDJ", DOI = "https://doi.org/10.1109/TSE.2013.47", ISSN = "0098-5589 (print), 1939-3520 (electronic)", ISSN-L = "0098-5589", bibdate = "Thu Feb 1 19:49:24 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranssoftweng2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=6617637", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Software Engineering", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=32", } @InProceedings{Knopp:2014:EMT, author = "T. 
Knopp", booktitle = "{2014 First Workshop for High Performance Technical Computing in Dynamic Languages}", title = "Experimental Multi-threading Support for the {Julia} Programming Language", publisher = pub-IEEE, address = pub-IEEE:adr, pages = "1--5", year = "2014", DOI = "https://doi.org/10.1109/HPTCDL.2014.11", bibdate = "Thu Apr 8 07:17:08 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/julia.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Julia programming language", } @Article{Kvatinsky:2014:MBM, author = "Shahar Kvatinsky and Yuval H. Nacson and Yoav Etsion and Eby G. Friedman and Avinoam Kolodny and Uri C. Weiser", title = "Memristor-Based Multithreading", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "13", number = "1", pages = "41--44", month = jan # "\slash " # jun, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2013.3", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Jun 20 17:18:18 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Switch on Event Multithreading (SoE MT, also known as coarse-grained MT and block MT) processors run multiple threads on a pipeline machine, while the pipeline switches threads on stall events (e.g., cache miss). The thread switch penalty is determined by the number of stages in the pipeline that are flushed of in-flight instructions. In this paper, Continuous Flow Multithreading (CFMT), a new architecture of SoE MT, is introduced. In CFMT, a multistate pipeline register (MPR) holds the microarchitectural state of multiple different threads within the execution pipeline stages, where only one thread is active at a time. The MPRs eliminate the need to flush in-flight instructions and therefore significantly improve performance. 
In recent years, novel memory technologies such as Resistive RAM (RRAM) and Spin Torque Transfer Magnetoresistive RAM (STT-MRAM), have been developed. All of these technologies are nonvolatile, store data as resistance, and can be described as ``memristors''. Memristors are power efficient, dense, and fast as compared to standard memory technologies such as SRAM, DRAM, and Flash. Memristors therefore provide the opportunity to place the MPRs physically within the pipeline stages. A performance analysis of CFMT is compared to conventional SoE MT processors, demonstrating up to a 2X performance improvement, while the operational mechanism, due to the use of memristors, is low power and low complexity as compared to conventional SoE MT processors.", acknowledgement = ack-nhfb, affiliation = "Kvatinsky, S (Reprint Author), Technion Israel Inst Technol, Dept Elect Engn, IL-32000 Haifa, Israel. Kvatinsky, Shahar; Etsion, Yoav; Kolodny, Avinoam; Weiser, Uri C., Technion Israel Inst Technol, Dept Elect Engn, IL-32000 Haifa, Israel. Etsion, Yoav, Technion Israel Inst Technol, Dept Comp Sci, IL-32000 Haifa, Israel. Friedman, Eby G., Univ Rochester, Dept Elect \& Comp Engn, Rochester, NY 14627 USA.", author-email = "skva@tx.technion.ac.il", da = "2019-06-20", doc-delivery-number = "AT5MU", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Hasso Plattner Institute", funding-text = "This work was supported by the Hasso Plattner Institute. The authors thank Ravi Patel for his comments and area overhead estimation and to Nimrod Wald and Guy Satat for their help in evaluating the architecture.", journal-iso = "IEEE Comput. Archit.
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "memristor; multithreaded processors; phase change memory; RRAM; STT-MRAM", keywords-plus = "RESISTIVE SWITCHING MEMORIES", number-of-cited-references = "21", research-areas = "Computer Science", times-cited = "10", unique-id = "Kvatinsky:2014:MBM", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Li:2014:PDC, author = "Yong Li and R. Melhem and A. K. Jones", title = "A Practical Data Classification Framework for Scalable and High Performance Chip-Multiprocessors", journal = j-IEEE-TRANS-COMPUT, volume = "63", number = "12", pages = "2905--2918", month = dec, year = "2014", CODEN = "ITCOB4", DOI = "https://doi.org/10.1109/TC.2013.161", ISSN = "0018-9340 (print), 1557-9956 (electronic)", ISSN-L = "0018-9340", bibdate = "Thu Dec 4 10:36:57 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Computers", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12", keywords = "application-specific characteristics; Benchmark testing; cache coherence; cache coherence design; cache storage; chip multiprocessor; Coherence; coherence directory; coherence overhead mitigation; coherence traffic; compiler-assisted mechanism; compilers; data access behavior; data access latency mitigation; data classification; data classification scheme; Dynamic scheduling; Instruction sets; interconnect; many-core architectures; microarchitectural constructs; multi-threaded parallel; NUCA-based caching; OpenMP; Optimization; parallel applications; parallel architectures; pattern classification; performance evaluation; performance improvement; pipelined parallel; Practically private; practically private; program compilers; Resource management; Runtime; scalable high-performance parallel systems; TLB;
ubiquitous computing", } @Article{Liu:2014:PPF, author = "Tongping Liu and Chen Tian and Ziang Hu and Emery D. Berger", title = "{PREDATOR}: predictive false sharing detection", journal = j-SIGPLAN, volume = "49", number = "8", pages = "3--14", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555244", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "False sharing is a notorious problem for multithreaded applications that can drastically degrade both performance and scalability. Existing approaches can precisely identify the sources of false sharing, but only report false sharing actually observed during execution; they do not generalize across executions. Because false sharing is extremely sensitive to object layout, these detectors can easily miss false sharing problems that can arise due to slight differences in memory allocation order or object placement decisions by the compiler. In addition, they cannot predict the impact of false sharing on hardware with different cache line sizes. This paper presents PREDATOR, a predictive software-based false sharing detector. PREDATOR generalizes from a single execution to precisely predict false sharing that is latent in the current execution. PREDATOR tracks accesses within a range that could lead to false sharing given different object placement. It also tracks accesses within virtual cache lines, contiguous memory ranges that span actual hardware cache lines, to predict sharing on hardware platforms with larger cache line sizes. For each, it reports the exact program location of predicted false sharing problems, ranked by their projected impact on performance. We evaluate PREDATOR across a range of benchmarks and actual applications. 
PREDATOR identifies problems undetectable with previous tools, including two previously-unknown false sharing problems, with no false positives. PREDATOR is able to immediately locate false sharing problems in MySQL and the Boost library that had eluded detection for years.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "PPoPP '14 conference proceedings.", } @Article{Liu:2014:TAP, author = "Xu Liu and John Mellor-Crummey", title = "A tool to analyze the performance of multithreaded programs on {NUMA} architectures", journal = j-SIGPLAN, volume = "49", number = "8", pages = "259--272", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555271", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Almost all of today's microprocessors contain memory controllers and directly attach to memory. Modern multiprocessor systems support non-uniform memory access (NUMA): it is faster for a microprocessor to access memory that is directly attached than it is to access memory attached to another processor. Without careful distribution of computation and data, a multithreaded program running on such a system may have high average memory access latency. To use multiprocessor systems efficiently, programmers need performance tools to guide the design of NUMA-aware codes. To address this need, we enhanced the HPCToolkit performance tools to support measurement and analysis of performance problems on multiprocessor systems with multiple NUMA domains. With these extensions, HPCToolkit helps pinpoint, quantify, and analyze NUMA bottlenecks in executions of multithreaded programs. 
It computes derived metrics to assess the severity of bottlenecks, analyzes memory accesses, and provides a wealth of information to guide NUMA optimization, including information about how to distribute data to reduce access latency and minimize contention. This paper describes the design and implementation of our extensions to HPCToolkit. We demonstrate their utility by describing case studies in which we use these capabilities to diagnose NUMA bottlenecks in four multithreaded applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "PPoPP '14 conference proceedings.", } @Article{Liu:2014:TPA, author = "Bin Liu and Yinliang Zhao and Yuxiang Li and Yanjun Sun and Boqin Feng", title = "A thread partitioning approach for speculative multithreading", journal = j-J-SUPERCOMPUTING, volume = "67", number = "3", pages = "778--805", month = mar, year = "2014", CODEN = "JOSUED", DOI = "https://doi.org/10.1007/s11227-013-1000-1", ISSN = "0920-8542 (print), 1573-0484 (electronic)", ISSN-L = "0920-8542", bibdate = "Sat Mar 8 14:59:14 MST 2014", bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=67&issue=3; https://www.math.utah.edu/pub/tex/bib/jsuper.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer.com/article/10.1007/s11227-013-1000-1", acknowledgement = ack-nhfb, fjournal = "The Journal of Supercomputing", journal-URL = "http://link.springer.com/journal/11227", } @Article{Lu:2014:EDM, author = "Kai Lu and Xu Zhou and Tom Bergan and Xiaoping Wang", title = "Efficient deterministic multithreading without global barriers", journal = j-SIGPLAN, volume = "49", number = "8", pages = "287--300", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555252", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 
16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Multithreaded programs execute nondeterministically on conventional architectures and operating systems. This complicates many tasks, including debugging and testing. Deterministic multithreading (DMT) makes the output of a multithreaded program depend on its inputs only, which can totally solve the above problem. However, current DMT implementations suffer from a common inefficiency: they use frequent global barriers to enforce a deterministic ordering on memory accesses. In this paper, we eliminate that inefficiency using an execution model we call deterministic lazy release consistency (DLRC). Our execution model uses the Kendo algorithm to enforce a deterministic ordering on synchronization, and it uses a deterministic version of the lazy release consistency memory model to propagate memory updates across threads. Our approach guarantees that programs execute deterministically even when they contain data races. We implemented a DMT system based on these ideas (RFDet) and evaluated it using 16 parallel applications. Our implementation targets C/C++ programs that use POSIX threads. 
Results show that RFDet gains nearly 2x speedup compared with DThreads --- a state-of-the-art DMT system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "PPoPP '14 conference proceedings.", } @Article{Maiya:2014:RDA, author = "Pallavi Maiya and Aditya Kanade and Rupak Majumdar", title = "Race detection for {Android} applications", journal = j-SIGPLAN, volume = "49", number = "6", pages = "316--325", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594311", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Programming environments for smartphones expose a concurrency model that combines multi-threading and asynchronous event-based dispatch. While this enables the development of efficient and feature-rich applications, unforeseen thread interleavings coupled with non-deterministic reorderings of asynchronous tasks can lead to subtle concurrency errors in the applications. In this paper, we formalize the concurrency semantics of the Android programming model. We further define the happens-before relation for Android applications, and develop a dynamic race detection technique based on this relation. Our relation generalizes the so far independently studied happens-before relations for multi-threaded programs and single-threaded event-driven programs. Additionally, our race detection technique uses a model of the Android runtime environment to reduce false positives. We have implemented a tool called DroidRacer. It generates execution traces by systematically testing Android applications and detects data races by computing the happens-before relation on the traces.
We analyzed 15 Android applications including popular applications such as Facebook, Twitter and K-9 Mail. Our results indicate that data races are prevalent in Android applications, and that DroidRacer is an effective tool to identify data races.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "PLDI '14 conference proceedings.", } @Article{Martinsen:2014:HTL, author = "Jan Kasper Martinsen and Hakan Grahn and Anders Isberg", title = "Heuristics for Thread-Level Speculation in {Web} Applications", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "13", number = "2", pages = "77--80", month = jul # "\slash " # dec, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2013.26", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "JavaScript is a sequential programming language, and Thread-Level Speculation has been proposed to dynamically extract parallelism in order to take advantage of parallel hardware. In previous work, we have showed significant speed-ups with a simple on/off speculation heuristic. In this paper, we propose and evaluate three heuristics for dynamically adapt the speculation: a 2-bit heuristic, an exponential heuristic, and a combination of these two. Our results show that the combined heuristic is able to both increase the number of successful speculations and decrease the execution time for 15 popular web applications.", acknowledgement = ack-nhfb, affiliation = "Martinsen, JK (Reprint Author), Blekinge Inst Technol, Sch Comp, SE-37179 Karlskrona, Sweden. Martinsen, Jan Kasper; Grahn, Hakan, Blekinge Inst Technol, Sch Comp, SE-37179 Karlskrona, Sweden. 
Isberg, Anders, Sony Mobile Commun AB, SE-22188 Lund, Sweden.", author-email = "Jan.Kasper.Martinsen@bth.se Hakan.Grahn@bth.se Anders.Isberg@sonymobile.com", da = "2019-06-20", doc-delivery-number = "AX5PM", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Industrial Excellence Center EASE - Embedded Applications Software Engineering; BESQ+ research project --- Knowledge Foundation in Sweden [20100311]", funding-text = "This work was partly funded by the Industrial Excellence Center EASE --- Embedded Applications Software Engineering, (http://ease.cs.lth.se), and the BESQ+ research project funded by the Knowledge Foundation (grant number 20100311) in Sweden.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "2-bit heuristic; Automatic Parallelization; Benchmark testing; C.1.4 Parallel Architectures; C.1.4.f Speculative multi-threading; exponential heuristic; Instruction sets; Internet; Java; JavaScript; Multicore processors; Multithreading; Parallel Computing; parallel hardware; Parallel processing; parallel programming; sequential programming language; Social network services; thread-level speculation; Web applications", number-of-cited-references = "12", oa = "Green Published", ORCID-numbers = "Martinsen, Jan Kasper/0000-0001-8915-3633 Grahn, Hakan/0000-0001-9947-1088", research-areas = "Computer Science", times-cited = "2", unique-id = "Martinsen:2014:HTL", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Morishima:2014:PEG, author = "Shin Morishima and Hiroki Matsutani", title = "Performance Evaluations of Graph Database using {CUDA} and {OpenMP} Compatible Libraries", journal = j-COMP-ARCH-NEWS, volume = "42", number = "4", pages = "75--80", year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2693714.2693728", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = 
"0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Graph databases use graph structures to store data sets as nodes, edges, and properties. They are used to store and search the relationships between a large number of nodes, such as social networking services and recommendation engines that use customer social graphs. Since computation cost for graph search queries increases as the graph becomes large, in this paper we accelerate the graph search functions (Dijkstra and A* algorithms) of a graph database Neo4j using two ways: multithreaded library and CUDA library for graphics processing units (GPUs). We use 100,000-node graphs generated based on a degree distribution of Facebook social graph for evaluations. Our multi-threaded and GPU-based implementations require an auxiliary adjacency matrix for a target graph. The results show that, when we do not take into account additional overhead to generate the auxiliary adjacency matrix, multi-threaded version improves the Dijkstra and A* search performance by 16.2x and 13.8x compared to the original implementation. The GPU-based implementation improves the Dijkstra and A* search performance by 26.2x and 32.8x. 
When we take into account the overhead, although the speed-ups by our implementations are reduced, by reusing the auxiliary adjacency matrix for multiple graph search queries we can significantly improve the graph search performance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", remark = "HEART '14 conference proceedings.", } @Article{Mushtaq:2014:EHP, author = "Hamid Mushtaq and Zaid Al-Ars and Koen Bertels", title = "Efficient and highly portable deterministic multithreading {(DetLock)}", journal = j-COMPUTING, volume = "96", number = "12", pages = "1131--1147", month = dec, year = "2014", CODEN = "CMPTA2", DOI = "https://doi.org/10.1007/s00607-013-0370-9", ISSN = "0010-485X (print), 1436-5057 (electronic)", ISSN-L = "0010-485X", bibdate = "Wed Feb 11 07:42:26 MST 2015", bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0010-485X&volume=96&issue=12; https://www.math.utah.edu/pub/tex/bib/computing.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer.com/article/10.1007/s00607-013-0370-9", acknowledgement = ack-nhfb, fjournal = "Computing", journal-URL = "http://link.springer.com/journal/607", } @Article{Ngo:2014:EVC, author = "Tri Minh Ngo and Mari{\"e}lle Stoelinga and Marieke Huisman", title = "Effective verification of confidentiality for multi-threaded programs", journal = j-J-COMP-SECUR, volume = "22", number = "2", pages = "269--300", month = "????", year = "2014", CODEN = "JCSIET", DOI = "https://doi.org/10.3233/JCS-130492", ISSN = "0926-227X (print), 1875-8924 (electronic)", ISSN-L = "0926-227X", bibdate = "Tue May 24 06:26:12 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/jcompsecur.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Journal of Computer Security", journal-URL = 
"http://content.iospress.com/journals/journal-of-computer-security", } @Article{Niewiadomski:2014:SVG, author = "Artur Niewiadomski and Jaroslaw Skaruz and Wojciech Penczek and Maciej Szreter and Mariusz Jarocki", title = "{SMT} Versus Genetic and {OpenOpt} Algorithms: Concrete Planning in the {PlanICS} Framework", journal = j-FUND-INFO, volume = "135", number = "4", pages = "451--466", month = oct, year = "2014", CODEN = "FUMAAJ", DOI = "https://doi.org/10.3233/FI-2014-1134", ISSN = "0169-2968 (print), 1875-8681 (electronic)", ISSN-L = "0169-2968", bibdate = "Sat Mar 5 17:20:06 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/fundinfo2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Fundamenta Informaticae", journal-URL = "http://content.iospress.com/journals/fundamenta-informaticae", } @Article{Niu:2014:MCF, author = "Ben Niu and Gang Tan", title = "Modular control-flow integrity", journal = j-SIGPLAN, volume = "49", number = "6", pages = "577--587", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594295", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Control-Flow Integrity (CFI) is a software-hardening technique. It inlines checks into a program so that its execution always follows a predetermined Control-Flow Graph (CFG). As a result, CFI is effective at preventing control-flow hijacking attacks. However, past fine-grained CFI implementations do not support separate compilation, which hinders its adoption. We present Modular Control-Flow Integrity (MCFI), a new CFI technique that supports separate compilation. MCFI allows modules to be independently instrumented and linked statically or dynamically. 
The combined module enforces a CFG that is a combination of the individual modules' CFGs. One challenge in supporting dynamic linking in multithreaded code is how to ensure a safe transition from the old CFG to the new CFG when libraries are dynamically linked. The key technique we use is to have the CFG represented in a runtime data structure and have reads and updates of the data structure wrapped in transactions to ensure thread safety. Our evaluation on SPECCPU2006 benchmarks shows that MCFI supports separate compilation, incurs low overhead of around 5\%, and enhances security.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "PLDI '14 conference proceedings.", } @Article{Odaira:2014:EGI, author = "Rei Odaira and Jose G. Castanos and Hisanobu Tomari", title = "Eliminating global interpreter locks in {Ruby} through hardware transactional memory", journal = j-SIGPLAN, volume = "49", number = "8", pages = "131--142", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555247", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many scripting languages use a Global Interpreter Lock (GIL) to simplify the internal designs of their interpreters, but this kind of lock severely lowers the multi-thread performance on multi-core machines. This paper presents our first results eliminating the GIL in Ruby using Hardware Transactional Memory (HTM) in the IBM zEnterprise EC12 and Intel 4th Generation Core processors. Though prior prototypes replaced a GIL with HTM, we tested realistic programs, the Ruby NAS Parallel Benchmarks (NPB), the WEBrick HTTP server, and Ruby on Rails.
We devised a new technique to dynamically adjust the transaction lengths on a per-bytecode basis, so that we can optimize the likelihood of transaction aborts against the relative overhead of the instructions to begin and end the transactions. Our results show that HTM achieved 1.9- to 4.4-fold speedups in the NPB programs over the GIL with 12 threads, and 1.6- and 1.2-fold speedups in WEBrick and Ruby on Rails, respectively. The dynamic transaction-length adjustment chose the best transaction lengths for any number of threads and applications with sufficiently long running times.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "PPoPP '14 conference proceedings.", } @Article{Peternier:2014:IEU, author = "Achille Peternier and Danilo Ansaloni and Daniele Bonetta and Cesare Pautasso and Walter Binder", title = "Improving execution unit occupancy on {SMT}-based processors through hardware-aware thread scheduling", journal = j-FUT-GEN-COMP-SYS, volume = "30", number = "??", pages = "229--241", month = jan, year = "2014", CODEN = "FGSEVI", ISSN = "0167-739X (print), 1872-7115 (electronic)", ISSN-L = "0167-739X", bibdate = "Mon Dec 2 16:57:46 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/futgencompsys.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.sciencedirect.com/science/article/pii/S0167739X13001295", acknowledgement = ack-nhfb, fjournal = "Future Generation Computer Systems", journal-URL = "http://www.sciencedirect.com/science/journal/0167739X", } @Article{Petrovic:2014:LHM, author = "Darko Petrovi{\'c} and Thomas Ropars and Andr{\'e} Schiper", title = "Leveraging hardware message passing for efficient thread synchronization", journal = j-SIGPLAN, volume = "49", number = "8", pages = "143--154", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555251", ISSN = "0362-1340 (print), 1523-2867 
(print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "As the level of parallelism in manycore processors keeps increasing, providing efficient mechanisms for thread synchronization in concurrent programs is becoming a major concern. On cache-coherent shared-memory processors, synchronization efficiency is ultimately limited by the performance of the underlying cache coherence protocol. This paper studies how hardware support for message passing can improve synchronization performance. Considering the ubiquitous problem of mutual exclusion, we adapt two state-of-the-art solutions used on shared-memory processors, namely the server approach and the combining approach, to leverage the potential of hardware message passing. We propose HybComb, a novel combining algorithm that uses both message passing and shared memory features of emerging hybrid processors. We also introduce MP-Server, a straightforward adaptation of the server approach to hardware message passing. Evaluation on Tilera's TILE-Gx processor shows that MP-Server can execute contended critical sections with unprecedented throughput, as stalls related to cache coherence are removed from the critical path. HybComb can achieve comparable performance, while avoiding the need to dedicate server cores. Consequently, our queue and stack implementations, based on MP-Server and HybComb, largely outperform their most efficient pure-shared-memory counterparts.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "PPoPP '14 conference proceedings.", } @Article{Pricopi:2014:TSA, author = "M. Pricopi and T. 
Mitra", title = "Task Scheduling on Adaptive Multi-Core", journal = j-IEEE-TRANS-COMPUT, volume = "63", number = "10", pages = "2590--2603", month = oct, year = "2014", CODEN = "ITCOB4", DOI = "https://doi.org/10.1109/TC.2013.115", ISSN = "0018-9340 (print), 1557-9956 (electronic)", ISSN-L = "0018-9340", bibdate = "Thu Nov 06 07:29:34 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Computers", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12", keywords = "adaptive architectures; adaptive multi-cores; adaptive multicore architectures; core allocation; dynamic heterogeneous multi-core; embedded domain; general-purpose computing; ILP; instruction-level parallelism; malleable and moldable tasks; multi-threading; offline scheduler; on-chip cores; online scheduler; parallel applications; parallel architectures; power constraints; resource allocation; resource allocation problems; Scheduling; scheduling; sequential application; sequential code; sequential fragments; task scheduling; thermal constraints; thread-level parallelism; TLP", } @Article{Pusukuri:2014:LCA, author = "Kishore Kumar Pusukuri and Rajiv Gupta and Laxmi Narayan Bhuyan", title = "Lock contention aware thread migrations", journal = j-SIGPLAN, volume = "49", number = "8", pages = "369--370", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555273", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "On a cache-coherent multicore multiprocessor system, the performance of a multithreaded application with high lock contention is very sensitive to the distribution of application 
threads across multiple processors. This is because the distribution of threads impacts the frequency of lock transfers between processors, which in turn impacts the frequency of last-level cache (LLC) misses that lie on the critical path of execution. Inappropriate distribution of threads across processors increases LLC misses in the critical path and significantly degrades performance of multithreaded programs. To alleviate the above problem, this paper overviews a thread migration technique, which migrates threads of a multithreaded program across multicore processors so that threads seeking locks are more likely to find the locks on the same processor.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "PPoPP '14 conference proceedings.", } @Article{Qian:2014:PRR, author = "Xuehai Qian and Benjamin Sahelices and Depei Qian", title = "{Pacifier}: record and replay for relaxed-consistency multiprocessors with distributed directory protocol", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "433--444", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665736", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Record and Deterministic Replay (R\&R) of multithreaded programs on relaxed-consistency multiprocessors with distributed directory protocol has been a long-standing open problem. The independently developed RelaxReplay [8] solves the problem by assuming write atomicity. This paper proposes Pacifier, the first R\&R scheme to provide a solution without assuming write atomicity. R\&R for relaxed-consistency multiprocessors needs to detect, record and replay Sequential Consistency Violations (SCV). 
Pacifier has two key components: (i) Relog, a general memory reordering logging and replay mechanism that can reproduce SCVs in relaxed memory models, and (ii) Granule, an SCV detection scheme in the record phase with good precision, that indicates whether to record with Relog. We show that Pacifier is a sweet spot in the design space with a reasonable trade-off between hardware and log overhead. An evaluation with simulations of 16, 32 and 64 processors with Release Consistency (RC) running SPLASH-2 applications indicates that Pacifier incurs 3.9\%--16\% larger logs. The slowdown of Pacifier during replay is 10.1\%--30.5\% compared to native execution.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", remark = "ISCA '14 conference proceedings.", } @Article{Rahman:2014:CCO, author = "Musfiq Rahman and Bruce R. Childers and Sangyeun Cho", title = "{COMeT+}: Continuous Online Memory Testing with Multi-Threading Extension", journal = j-IEEE-TRANS-COMPUT, volume = "63", number = "7", pages = "1668--1681", month = jul, year = "2014", CODEN = "ITCOB4", DOI = "https://doi.org/10.1109/TC.2013.65", ISSN = "0018-9340 (print), 1557-9956 (electronic)", ISSN-L = "0018-9340", bibdate = "Mon Aug 25 08:24:32 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Computers", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12", } @Article{Ribic:2014:EEW, author = "Haris Ribic and Yu David Liu", title = "Energy-efficient work-stealing language runtimes", journal = j-COMP-ARCH-NEWS, volume = "42", number = "1", pages = "513--528", month = mar, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2654822.2541971", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Aug 18 
17:12:47 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Work stealing is a promising approach to constructing multithreaded program runtimes of parallel programming languages. This paper presents HERMES, an energy-efficient work-stealing language runtime. The key insight is that threads in a work-stealing environment --- thieves and victims --- have varying impacts on the overall program running time, and a coordination of their execution ``tempo'' can lead to energy efficiency with minimal performance loss. The centerpiece of HERMES is two complementary algorithms to coordinate thread tempo: the workpath-sensitive algorithm determines tempo for each thread based on thief-victim relationships on the execution path, whereas the workload-sensitive algorithm selects appropriate tempo based on the size of work-stealing deques. We construct HERMES on top of Intel Cilk Plus's runtime, and implement tempo adjustment through standard Dynamic Voltage and Frequency Scaling (DVFS). Benchmarks running on HERMES demonstrate an average of 11-12\% energy savings with an average of 3-4\% performance loss through meter-based measurements over commercial CPUs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", remark = "ASPLOS '14 conference proceedings.", } @Article{Rogers:2014:LYL, author = "Timothy G. Rogers and Mike O'Connor and Tor M. 
Aamodt", title = "Learning your limit: managing massively multithreaded caches through scheduling", journal = j-CACM, volume = "57", number = "12", pages = "91--98", month = dec, year = "2014", CODEN = "CACMA2", DOI = "https://doi.org/10.1145/2682583", ISSN = "0001-0782 (print), 1557-7317 (electronic)", ISSN-L = "0001-0782", bibdate = "Thu Jan 22 08:42:40 MST 2015", bibsource = "http://www.acm.org/pubs/contents/journals/cacm/; https://www.math.utah.edu/pub/tex/bib/cacm2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://cacm.acm.org/magazines/2014/12/180789/fulltext", abstract = "The gap between processor and memory performance has become a focal point for microprocessor research and development over the past three decades. Modern architectures use two orthogonal approaches to help alleviate this issue: (1) Almost every microprocessor includes some form of on-chip storage, usually in the form of caches, to decrease memory latency and make more effective use of limited memory bandwidth. (2) Massively multithreaded architectures, such as graphics processing units (GPUs), attempt to hide the high latency to memory by rapidly switching between many threads directly in hardware. This paper explores the intersection of these two techniques. We study the effect of accelerating highly parallel workloads with significant locality on a massively multithreaded GPU. We observe that the memory access stream seen by on-chip caches is the direct result of decisions made by the hardware thread scheduler. Our work proposes a hardware scheduling technique that reacts to feedback from the memory system to create a more cache-friendly access stream. We evaluate our technique using simulations and show a significant performance improvement over previously proposed scheduling mechanisms. 
We demonstrate the effectiveness of scheduling as a cache management technique by comparing cache hit rate using our scheduler and an LRU replacement policy against other scheduling techniques using an optimal cache replacement policy.", acknowledgement = ack-nhfb, fjournal = "Communications of the ACM", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J79", } @Article{Samak:2014:MTS, author = "Malavika Samak and Murali Krishna Ramanathan", title = "Multithreaded test synthesis for deadlock detection", journal = j-SIGPLAN, volume = "49", number = "10", pages = "473--489", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660238", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Designing and implementing thread-safe multithreaded libraries can be a daunting task as developers of these libraries need to ensure that their implementations are free from concurrency bugs, including deadlocks. The usual practice involves employing software testing and/or dynamic analysis to detect deadlocks. Their effectiveness is dependent on well-designed multithreaded test cases. Unsurprisingly, developing multithreaded tests is significantly harder than developing sequential tests for obvious reasons. In this paper, we address the problem of automatically synthesizing multithreaded tests that can induce deadlocks. The key insight to our approach is that a subset of the properties observed when a deadlock manifests in a concurrent execution can also be observed in a single threaded execution. We design a novel, automatic, scalable and directed approach that identifies these properties and synthesizes a deadlock revealing multithreaded test. 
The input to our approach is the library implementation under consideration and the output is a set of deadlock revealing multithreaded tests. We have implemented our approach as part of a tool, named OMEN$^1$. OMEN is able to synthesize multithreaded tests on many multithreaded Java libraries. Applying a dynamic deadlock detector on the execution of the synthesized tests results in the detection of a number of deadlocks, including 35 real deadlocks in classes documented as thread-safe. Moreover, our experimental results show that dynamic analysis on multithreaded tests that are either synthesized randomly or developed by third-party programmers are ineffective in detecting the deadlocks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "OOPSLA '14 conference proceedings.", } @Book{Schildt:2014:JCR, editor = "Herbert Schildt", title = "{Java}: The Complete Reference", publisher = pub-MCGRAW-HILL, address = pub-MCGRAW-HILL:adr, edition = "Ninth", pages = "xxxiv + 1274", year = "2014", ISBN = "0-07-180855-8 (paperback), 0-07-180925-2, 0-07-180856-6", ISBN-13 = "978-0-07-180855-2, 978-0-07-180925-2, 978-0-07-180856-9", LCCN = "QA76.73.J38 S332 2014eb", bibdate = "Thu Dec 4 13:05:57 MST 2014", bibsource = "fsz3950.oclc.org:210/WorldCat; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Fully updated for Java SE 8, this edition explains how to develop, compile, debug, and run Java programs. The book covers the entire Java language, including its syntax, keywords, and fundamental programming principles, as well as significant portions of the Java API library. JavaBeans, servlets, applets, and Swing are examined and real-world examples demonstrate Java in action. New Java SE 8 features such as lambda expressions, the stream library, and the default interface method are discussed in detail. 
This Oracle Press resource also offers a solid introduction to JavaFX. Topics covered include: data types, variables, arrays, and operators; control statements; classes, objects, and methods; method overloading and overriding; inheritance; interfaces and packages; exception handling; multithreaded programming; enumerations, autoboxing, and annotations; I/O classes; generics; lambda expressions; string handling; collections framework; networking; event handling; AWT and Swing; concurrent and stream API; regular expressions; JavaFX; JavaBeans; and applets and servlets.", acknowledgement = ack-nhfb, shorttableofcontents = "The history and evolution of Java \\ An overview of Java \\ Data types, variables, and arrays \\ Operators \\ Control statements \\ Introducing classes \\ A closer look at methods and classes \\ Inheritance \\ Packages and interfaces \\ Exception handling \\ Multithreaded programming \\ Enumerations, autoboxing, and annotations (metadata) \\ I/O, applets, and other topics \\ Generics \\ Lambda expressions \\ String handling \\ Exploring java.lang \\ Java.util part 1: the collections framework \\ Java.util part 2: more utility classes \\ Input/output: exploring java.io \\ Exploring NIO \\ Networking \\ The applet class \\ Event handling \\ Introducing the AWT: working with windows, graphics, and text \\ Using AWT controls, layout managers, and menus \\ Images \\ The concurrency utilities \\ The stream API \\ Regular expressions and other packages \\ Introducing swing \\ Exploring swing \\ Introducing swing menus \\ Introducing JavaFX GUI programming \\ Exploring JavaFX controls \\ Introducing JavaFX menus \\ Java beans \\ Introducing servlets \\ Using Java's documentation comments", subject = "Java (Langage de programmation); Programmation Internet; Java (Computer program language); Internet programming; Internet programming.; Java (Computer program language)", tableofcontents = "Part I. The Java language \\ 1. 
The history and evolution of Java: Java's lineage; The creation of Java; How Java changed the Internet; Java's magic: the bytecode; Servlets: Java on the server side; The Java buzzwords; The evolution of Java; Java SE 8; A culture of innovation \\ 2. An overview of Java: Object-oriented programming; A first simple program; A second short program; Two control statements; Using blocks of code; Lexical issues; The Java class libraries \\ 3. Data types, variables, and arrays: Java is a strongly typed language; The primitive types; Integers; Floating-point types; Characters; Booleans; A closer look at literals; Variables; Type conversion and casting; Automatic type promotion in expressions; Arrays; A few words about strings; A note to C/C++ programmers about pointers \\ 4. Operators: Arithmetic operators; The bitwise operators; Relational operators; Boolean logical operators; The assignment operator; The ? operator; Operator precedence; Using parentheses \\ 5. Control statements: Java's selection statements; Iteration statements; Jump statements \\ 6. Introducing classes: Class fundamentals; Declaring objects; Assigning object reference variables; Introducing methods; Constructors; The this keyword; Garbage collection; The finalize() method; A stack class \\ 7. A closer look at methods and classes: Overloading methods; Using objects as parameters; A closer look at argument passing; Returning objects; Recursion; Introducing access control; Understanding static; Introducing final; Arrays revisited; Introducing nested and inner classes; Exploring the string class; Using command-line arguments; Varargs: variable-length arguments \\ 8. Inheritance: Inheritance basics; Using super; Creating a multilevel hierarchy; When constructors are executed; Method overriding; Dynamic method dispatch; Using abstract classes; Using final with inheritance; The object class \\ 9. 
Packages and interfaces: Packages; Access protection; Importing packages; Interfaces; Default interface methods; Use static methods in an interface; Final thoughts on packages and interfaces \\ 10. Exception handling: Exception-handling fundamentals; Exception types; Uncaught exceptions; Using try and catch; Multiple catch clauses; Nested try statements; Throw; Throws; Finally; Java's built-in exceptions; Creating your own exception subclasses; Chained exceptions; Three recently added exception features; Using exceptions \\ 11. Multithreaded programming: The Java thread model; The main thread; Creating a thread; Creating multiple threads; Using isAlive() and join(); Thread priorities; Synchronization; Interthread communication; Suspending, resuming, and stopping threads; Obtaining a thread's state; Using multithreading \\ 12. Enumerations, autoboxing, and annotations (metadata): Enumerations; Type wrappers; Autoboxing; Annotations (metadata); Type annotations; Repeating annotations \\ 13. I/O, applets, and other topics: I/O basics; Reading console input; Writing console output; The PrintWriter class; Reading and writing files; Automatically closing a file; Applet fundamentals; The transient and volatile modifiers; Using instanceof; Strictfp; Native methods; Problems with native methods; Using assert; Static import; Invoking overloaded constructors through this(); Compact API profiles \\ 14. Generics: What are generics?; A simple generics example; A generic class with two type parameters; The general form of a generic class; Bounded types; Using wildcard arguments; Creating a generic method; Generic interfaces; Raw types and legacy code; Generic class hierarchies; Type inference with generics; Erasure; Ambiguity errors; Some generic restrictions \\ 15. 
Lambda expressions: Introducing lambda expressions; Block lambda expressions; Generic functional interfaces; Passing lambda expressions as arguments; Lambda expressions and exceptions; Lambda expressions and variable capture; Method references; Constructor references; Predefined functional interfaces \\ Part II. The Java library. \\ 16. String handling: The string constructors; String length; Special string operations; Character extraction; String comparison; Searching strings; Modifying a string; Data conversion using valueOf(); Changing the case of characters within a string; Joining strings; Additional string methods; StringBuffer; StringBuilder \\ 17. Exploring java.lang: Primitive type wrappers; Void; Process; Runtime; ProcessBuilder; System; Object; Using clone() and the cloneable interface; Class; ClassLoader; Math; StrictMath; Compiler; Thread, ThreadGroup and runnable; ThreadLocal and InheritableThreadLocal; Package; RuntimePermission; Throwable; SecurityManager; StackTraceElement; Enum; ClassValue; The CharSequence interface; The comparable interface; The appendable interface; The iterable interface; The readable interface; The AutoCloseable interface; The Thread.UncaughtExceptionHandler interface; The java.lang subpackages \\ 18. java.util Part 1: The collections framework: Collections overview; JDK 5 changed the collections framework; The collection interfaces; The collection classes; Accessing a collection via an iterator; Spliterators; Storing user-defined classes in collections; The RandomAccess interface; Working with maps; Comparators; The collection algorithms; Arrays; The legacy classes and interfaces; Parting thoughts on collections \\ 19. 
java.util Part 2: More utility classes: StringTokenizer; BitSet; Optional, OptionalDouble, OptionalInt, and OptionalLong; Date; Calendar; GregorianCalendar; TimeZone; SimpleTimeZone; Locale; Random; Observable; Timer and TimerTask; Currency; Formatter; Scanner; The ResourceBundle, ListResourceBundle, and PropertyResourceBundle classes; Miscellaneous utility classes and interfaces; The java.util subpackages \\ 20. Input/output: exploring java.io: The I/O classes and interfaces; File; The AutoCloseable, Closeable, and flushable interfaces; I/O exceptions; Two ways to close a stream; The stream classes; The byte streams; The character streams; The console class; Serialization; Stream benefits \\ 21. Exploring NIO: The NIO classes; NIO fundamentals; Enhancements added to NIO by JDK 7; Using the NIO system; Pre-JDK 7 channel-based examples \\ 22. Networking: Networking basics; The networking classes and interfaces; InetAddress; Inet4Address and Inet6Address; TCP/IP client sockets; URL; URLConnection; HttpURLConnection; The URI class; Cookies; TCP/IP server sockets; Datagrams \\ 23. The applet class: Two types of applets; Applet basics; Applet architecture; An applet skeleton; Simple applet display methods; Requesting repainting; Using the status window; The HTML APPLET tag; Passing parameters to applets; getDocumentBase() and getCodeBase(); AppletContext and showDocument(); The AudioClip interface; The AppletStub interface; Outputting to the console \\ 24. Event handling: Two event handling mechanisms; The delegation event model; Event classes; The KeyEvent class; Sources of events; Event listener interfaces; Using the delegation event model; Adapter classes; Inner classes \\ 25. 
Introducing the AWT: working with windows, graphics, and text: AWT classes; Window fundamentals; Working with frame windows; Creating a frame window in an AWT-based applet; Creating a windowed program; Displaying information within a window; Introducing graphics; Working with color; Setting the paint mode; Working with fonts; Managing text output using FontMetrics \\ 26. Using AWT controls, layout managers, and menus: AWT control fundamentals; Labels; Using buttons; Applying check boxes; CheckboxGroup; Choice controls; Using lists; Managing scroll bars; Using a TextField; Using a TextArea; Understanding layout managers; Menu bars and menus; Dialog boxes; FileDialog; A word about overriding paint() \\ 27. Images: File formats; Image fundamentals: creating, loading, and displaying; ImageObserver; Double buffering; MediaTracker; ImageProducer; ImageConsumer; ImageFilter; Additional imaging classes \\ 28. The concurrency utilities: The concurrent API packages; Using synchronization objects; Phaser; Using an executor; The TimeUnit enumeration; the concurrent collections; Locks; Atomic operations; Parallel programming via the fork/join framework; The concurrency utilities versus Java's traditional approach \\ 29. The stream API: Stream basics; Reduction operations; Using parallel streams; Mapping; Collecting; Iterators and streams; More to explore in the stream API \\ 30. Regular expressions and other packages: The core Java API packages; Regular expression processing; Reflection; Remote method invocation (RMI); Formatting date and time with java.text; The time and date API added by JDK 8 \\ Part III. Introducing GUI programming with swing \\ 31. Introducing swing: The origins of swing; Swing is built on the AWT; Two key swing features; The MVC connection; Components and containers; The swing packages; A simple swing application; Event handling; Create a swing applet; Painting in swing \\ 32. 
Exploring swing: JLabel and ImageIcon; JTextField; The swing buttons; JTabbedPane; JScrollPane; JList; JComboBox; Trees; JTable \\ 33. Introducing swing menus: Menu basics; An overview of JMenuBar, JMenu, and JMenuItem; Create a main menu; Add Mnemonics and accelerators to menu items; Add images and tooltips to menu items; Use JRadioButtonMenuItem and JCheckBoxMenuItem; Create a popup menu; Create a toolbar; Use actions; Put the entire MenuDemo program together; Continuing your exploration of swing \\ Part IV. Introducing GUI programming with JavaFX \\ 34. Introducing JavaFX GUI programming: JavaFX basic concepts; A JavaFX application skeleton; Compiling and running a JavaFX program; The application thread; A simple JavaFX control: label; Using buttons and events; Drawing directly on a canvas \\ 35. Exploring JavaFX controls: Using image and ImageView; ToggleButton; RadioButton; CheckBox; ListView; ComboBox; TextField; ScrollPane; TreeView; Introducing effects and transforms; Adding tooltips; Disabling a control \\ 36. Introducing JavaFX menus: Menu basics; An overview of MenuBar, Menu, and MenuItem; Create a main menu; Add mnemonics and accelerators to menu items; Add images to menu items; Use RadioMenuItem and CheckMenuItem; Create a context menu; Create a toolbar; Put the entire MenuDemo program together; Continuing your exploration of JavaFX \\ Part V. Applying Java \\ 37. Java beans: What is a Java bean?; Advantages of Java beans; Introspection; Bound and constrained properties; Persistence; Customizers; The Java beans API; A bean example \\ 38. Introducing servlets: Background; The life cycle of a servlet; Servlet development options; Using Tomcat; A simple servlet; The servlet API; The javax.servlet package; Reading servlet parameters; The javax.servlet.http package; Handling HTTP requests and responses; Using cookies; Session tracking \\ Appendix. 
Using Java's documentation comments: The javadoc tags; The general form of a documentation comment; What javadoc outputs; An example that uses documentation comments", } @Article{Shih:2014:COR, author = "Wen-Li Shih and Yi-Ping You and Chung-Wen Huang and Jenq Kuen Lee", title = "Compiler Optimization for Reducing Leakage Power in Multithread {BSP} Programs", journal = j-TODAES, volume = "20", number = "1", pages = "9:1--9:??", month = nov, year = "2014", CODEN = "ATASFO", DOI = "https://doi.org/10.1145/2668119", ISSN = "1084-4309 (print), 1557-7309 (electronic)", ISSN-L = "1084-4309", bibdate = "Wed Nov 19 11:18:40 MST 2014", bibsource = "http://www.acm.org/pubs/contents/journals/todaes/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/todaes.bib", abstract = "Multithread programming is widely adopted in novel embedded system applications due to its high performance and flexibility. This article addresses compiler optimization for reducing the power consumption of multithread programs. A traditional compiler employs energy management techniques that analyze component usage in control-flow graphs with a focus on single-thread programs. In this environment the leakage power can be controlled by inserting on and off instructions based on component usage information generated by flow equations. However, these methods cannot be directly extended to a multithread environment due to concurrent execution issues. This article presents a multithread power-gating framework composed of multithread power-gating analysis (MTPGA) and predicated power-gating (PPG) energy management mechanisms for reducing the leakage power when executing multithread programs on simultaneous multithreading (SMT) machines. Our multithread programming model is based on hierarchical bulk-synchronous parallel (BSP) models. 
Based on a multithread component analysis with dataflow equations, our MTPGA framework estimates the energy usage of multithread programs and inserts PPG operations as power controls for energy management. We performed experiments by incorporating our power optimization framework into SUIF compiler tools and by simulating the energy consumption with a post-estimated SMT simulator based on Wattch toolkits. The experimental results show that the total energy consumption of a system with PPG support and our power optimization method is reduced by an average of 10.09\% for BSP programs relative to a system without a power-gating mechanism on leakage contribution set to 30\%; and the total energy consumption is reduced by an average of 4.27\% on leakage contribution set to 10\%. The results demonstrate our mechanisms are effective in reducing the leakage energy of BSP multithread programs.", acknowledgement = ack-nhfb, articleno = "9", fjournal = "ACM Transactions on Design Automation of Electronic Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J776", } @Article{Sridharan:2014:AEP, author = "Srinath Sridharan and Gagan Gupta and Gurindar S. Sohi", title = "Adaptive, efficient, parallel execution of parallel programs", journal = j-SIGPLAN, volume = "49", number = "6", pages = "169--180", month = jun, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2666356.2594292", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 26 07:38:28 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Future multicore processors will be heterogeneous, be increasingly less reliable, and operate in dynamically changing operating conditions. 
Such environments will result in a constantly varying pool of hardware resources which can greatly complicate the task of efficiently exposing a program's parallelism onto these resources. Coupled with this uncertainty is the diverse set of efficiency metrics that users may desire. This paper proposes Varuna, a system that dynamically, continuously, rapidly and transparently adapts a program's parallelism to best match the instantaneous capabilities of the hardware resources while satisfying different efficiency metrics. Varuna is applicable to both multithreaded and task-based programs and can be seamlessly inserted between the program and the operating system without needing to change the source code of either. We demonstrate Varuna's effectiveness in diverse execution environments using unaltered C/C++ parallel programs from various benchmark suites. Regardless of the execution environment, Varuna always outperformed the state-of-the-art approaches for the efficiency metrics considered.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "PLDI '14 conference proceedings.", } @Article{Steele:2014:FSP, author = "Guy L. {Steele, Jr.} and Doug Lea and Christine H. 
Flood", title = "Fast splittable pseudorandom number generators", journal = j-SIGPLAN, volume = "49", number = "10", pages = "453--472", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660195", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/bibnet/authors/m/marsaglia-george.bib; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/jstatsoft.bib; https://www.math.utah.edu/pub/tex/bib/mathcw.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/prng.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib; https://www.math.utah.edu/pub/tex/bib/tomacs.bib", abstract = "We describe a new algorithm SplitMix for an object-oriented and splittable pseudorandom number generator (PRNG) that is quite fast: 9 64-bit arithmetic/logical operations per 64 bits generated. A conventional linear PRNG object provides a generate method that returns one pseudorandom value and updates the state of the PRNG, but a splittable PRNG object also has a second operation, split, that replaces the original PRNG object with two (seemingly) independent PRNG objects, by creating and returning a new such object and updating the state of the original object. Splittable PRNG objects make it easy to organize the use of pseudorandom numbers in multithreaded programs structured using fork-join parallelism. No locking or synchronization is required (other than the usual memory fence immediately after object creation). Because the generate method has no loops or conditionals, it is suitable for SIMD or GPU implementation. We derive SplitMix from the DotMix algorithm of Leiserson, Schardl, and Sukha by making a series of program transformations and engineering improvements. 
The end result is an object-oriented version of the purely functional API used in the Haskell library for over a decade, but SplitMix is faster and produces pseudorandom sequences of higher quality; it is also far superior in quality and speed to java.util.Random, and has been included in Java JDK8 as the class java.util.SplittableRandom. We have tested the pseudorandom sequences produced by SplitMix using two standard statistical test suites (DieHarder and TestU01) and they appear to be adequate for ``everyday'' use, such as in Monte Carlo algorithms and randomized data structures where speed is important.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark-1 = "OOPSLA '14 conference proceedings.", remark-2 = "On page 466, the authors describe an interesting technique for improving a user-supplied seed that might produce insufficient randomness in the next several members of the random-number sequence: ``Long runs of 0-bits or of 1-bits in the $\gamma$ [candidate seed] value do not cause bits of the seed to flip; an approximate proxy for how many bits of the seed will flip might be the number of bit pairs of the form 01 or 10 in the candidate $\gamma$ value {\tt z}. Therefore we require that the number of such pairs, as computed by {\tt Long.bitCount(z ^ (z >>> 1))}, exceed 24; if it does not, then the candidate z is replaced by the XOR of {\tt z} and {\tt 0xaaaaaaaaaaaaaaaaL}, a constant chosen so that (a) the low bit of {\tt z} remains 1, and (b) every bit pair of the form 00 or 11 becomes either 01 or 10, and likewise every bit pair of the form 01 or 10 becomes either 00 or 11, so the new value necessarily has more than 24 bit pairs whose bits differ. Testing shows that this trick appears to be effective.''", remark-3 = "From page 468: ``we did three runs of TestU01 BigCrush on {\tt java.util.Random}; 19 tests produced clear failure on all three runs. 
These included 9 Birthday Spacings tests, 8 ClosePairs tests, a WeightDistrib test, and a CouponCollector test. This confirms L'Ecuyer's observation that {\tt java.util.Random} tends to fail Birthday Spacings tests [17].'' The reference is to \cite{LEcuyer:2001:SUR}.", remark-4 = "From page 470: ``[L'Ecuyer] comments, `In the Java class {\tt java.util.Random}, RNG streams can be declared and constructed dynamically, without limit on their number. However, no precaution seems to have been taken regarding the independence of these streams.'''", remark-5 = "From page 471: ``They [the generators in this paper] should not be used for cryptographic or security applications, because they are too predictable (the mixing functions are easily inverted, and two successive outputs suffice to reconstruct the internal state), \ldots{} One version seems especially suitable for use as a replacement for {\tt java.util.Random}, because it produces sequences of higher quality, is faster in sequential use, is easily parallelized for use in JDK8 stream expressions, and is amenable to efficient implementation on SIMD and GPU architectures.''", } @Article{Sung:2014:PTR, author = "I-Jui Sung and Juan G{\'o}mez-Luna and Jos{\'e} Mar{\'\i}a Gonz{\'a}lez-Linares and Nicol{\'a}s Guil and Wen-Mei W. Hwu", title = "In-place transposition of rectangular matrices on accelerators", journal = j-SIGPLAN, volume = "49", number = "8", pages = "207--218", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555266", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Matrix transposition is an important algorithmic building block for many numeric algorithms such as FFT. It has also been used to convert the storage layout of arrays. 
With more and more algebra libraries offloaded to GPUs, a high performance in-place transposition becomes necessary. Intuitively, in-place transposition should be a good fit for GPU architectures due to limited available on-board memory capacity and high throughput. However, direct application of CPU in-place transposition algorithms lacks the amount of parallelism and locality required by GPUs to achieve good performance. In this paper we present the first known in-place matrix transposition approach for the GPUs. Our implementation is based on a novel 3-stage transposition algorithm where each stage is performed using an elementary tiled-wise transposition. Additionally, when transposition is done as part of the memory transfer between GPU and host, our staged approach allows hiding transposition overhead by overlap with PCIe transfer. We show that the 3-stage algorithm allows larger tiles and achieves 3X speedup over a traditional 4-stage algorithm, with both algorithms based on our high-performance elementary transpositions on the GPU. We also show our proposed low-level optimizations improve the sustained throughput to more than 20 GB/s. Finally, we propose an asynchronous execution scheme that allows CPU threads to delegate in-place matrix transposition to GPU, achieving a throughput of more than 3.4 GB/s (including data transfers costs), and improving current multithreaded implementations of in-place transposition on CPU.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "PPoPP '14 conference proceedings.", } @Article{Tarvo:2014:AAM, author = "Alexander Tarvo and Steven P. 
Reiss", title = "Automated analysis of multithreaded programs for performance modeling", journal = j-SIGMETRICS, volume = "42", number = "1", pages = "557--558", month = jun, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2637364.2592016", ISSN = "0163-5999 (print), 1557-9484 (electronic)", ISSN-L = "0163-5999", bibdate = "Fri Jun 27 06:38:48 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigmetrics.bib", abstract = "We present an approach for building performance models of multithreaded programs automatically. We use a combination of static and dynamic analyses of a single representative run of the program to build its model. The model can predict performance of the program under a variety of configurations. This paper outlines how we construct the model and demonstrates how the resultant models accurately predict the performance and resource utilization of complex multithreaded programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGMETRICS Performance Evaluation Review", journal-URL = "http://portal.acm.org/toc.cfm?id=J618", } @Article{Turon:2014:GNW, author = "Aaron Turon and Viktor Vafeiadis and Derek Dreyer", title = "{GPS}: navigating weak memory with ghosts, protocols, and separation", journal = j-SIGPLAN, volume = "49", number = "10", pages = "691--707", month = oct, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2714064.2660243", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:21 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Weak memory models formalize the inconsistent behaviors that one can expect to observe in multithreaded programs running on modern hardware. 
In so doing, however, they complicate the already-difficult task of reasoning about correctness of concurrent code. Worse, they render impotent the sophisticated formal methods that have been developed to tame concurrency, which almost universally assume a strong ( i.e. sequentially consistent) memory model. This paper introduces GPS, the first program logic to provide a full-fledged suite of modern verification techniques --- including ghost state, protocols, and separation logic --- for high-level, structured reasoning about weak memory. We demonstrate the effectiveness of GPS by applying it to challenging examples drawn from the Linux kernel as well as lock-free data structures. We also define the semantics of GPS and prove in Coq that it is sound with respect to the axiomatic C11 weak memory model.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "OOPSLA '14 conference proceedings.", } @Article{Wadden:2014:RWD, author = "Jack Wadden and Alexander Lyashevsky and Sudhanva Gurumurthi and Vilas Sridharan and Kevin Skadron", title = "Real-world design and evaluation of compiler-managed {GPU} redundant multithreading", journal = j-COMP-ARCH-NEWS, volume = "42", number = "3", pages = "73--84", month = jun, year = "2014", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2678373.2665686", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Dec 3 16:18:50 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Reliability for general purpose processing on the GPU (GPGPU) is becoming a weak link in the construction of reliable supercomputer systems. 
Because hardware protection is expensive to develop, requires dedicated on-chip resources, and is not portable across different architectures, the efficiency of software solutions such as redundant multithreading (RMT) must be explored. This paper presents a real-world design and evaluation of automatic software RMT on GPU hardware. We first describe a compiler pass that automatically converts GPGPU kernels into redundantly threaded versions. We then perform detailed power and performance evaluations of three RMT algorithms, each of which provides fault coverage to a set of structures in the GPU. Using real hardware, we show that compiler-managed software RMT has highly variable costs. We further analyze the individual costs of redundant work scheduling, redundant computation, and inter-thread communication, showing that no single component in general is responsible for high overheads across all applications; instead, certain workload properties tend to cause RMT to perform well or poorly. 
Finally, we demonstrate the benefit of architectural support for RMT with a specific example of fast, register-level thread communication", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", remark = "ISCA '14 conference proceedings.", } @Article{Xu:2014:STM, author = "Yunlong Xu and Rui Wang and Nilanjan Goswami and Tao Li and Depei Qian", title = "Software Transactional Memory for {GPU} Architectures", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "13", number = "1", pages = "49--52", month = jan # "\slash " # jun, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1109/L-CA.2013.4", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Jun 20 17:18:18 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "To make applications with dynamic data sharing among threads benefit from GPU acceleration, we propose a novel software transactional memory system for GPU architectures (GPU-STM). The major challenges include ensuring good scalability with respect to the massively multithreading of GPUs, and preventing livelocks caused by the SIMT execution paradigm of GPUs. To this end, we propose (1) a hierarchical validation technique and (2) an encounter-time lock-sorting mechanism to deal with the two challenges, respectively. Evaluation shows that GPU-STM outperforms coarse-grain locks on GPUs by up to 20x.", acknowledgement = ack-nhfb, affiliation = "Xu, YL (Reprint Author), Xi An Jiao Tong Univ, Sch Elect \& Informat Engn, Xian 710049, Peoples R China. Xu, Yunlong; Qian, Depei, Xi An Jiao Tong Univ, Sch Elect \& Informat Engn, Xian 710049, Peoples R China. Wang, Rui; Qian, Depei, Beihang Univ, Sch Engn \& Comp Sci, Beijing, Peoples R China. 
Goswami, Nilanjan; Li, Tao, Univ Florida, ECE Dept, Gainesville, FL USA.", author-email = "xjtu.ylxu@stu.xjtu.edu.cn rui.wang@jsi.buaa.edu.cn nil@ufl.edu taoli@ece.ufl.edu depeiq@xjtu.edu.cn", da = "2019-06-20", doc-delivery-number = "AT5MU", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "NSF of China [61133004, 61128004, 61073011]; 863 Program of China [2012AA010902]", funding-text = "This work is supported by NSF of China under grant 61133004, 61128004 and 61073011, and 863 Program of China under grant 2012AA010902.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Multicore Processors; Parallel Programming; Run-time Environments; SIMD Processors", number-of-cited-references = "11", research-areas = "Computer Science", times-cited = "1", unique-id = "Xu:2014:STM", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Yang:2014:CNR, author = "Yi Yang and Huiyang Zhou", title = "{CUDA-NP}: realizing nested thread-level parallelism in {GPGPU} applications", journal = j-SIGPLAN, volume = "49", number = "8", pages = "93--106", month = aug, year = "2014", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2692916.2555254", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Nov 26 16:26:30 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Parallel programs consist of series of code sections with different thread-level parallelism (TLP). As a result, it is rather common that a thread in a parallel program, such as a GPU kernel in CUDA programs, still contains both sequential code and parallel loops. 
In order to leverage such parallel loops, the latest Nvidia Kepler architecture introduces dynamic parallelism, which allows a GPU thread to start another GPU kernel, thereby reducing the overhead of launching kernels from a CPU. However, with dynamic parallelism, a parent thread can only communicate with its child threads through global memory and the overhead of launching GPU kernels is non-trivial even within GPUs. In this paper, we first study a set of GPGPU benchmarks that contain parallel loops, and highlight that these benchmarks do not have a very high loop count or high degrees of TLP. Consequently, the benefits of leveraging such parallel loops using dynamic parallelism are too limited to offset its overhead. We then present our proposed solution to exploit nested parallelism in CUDA, referred to as CUDA-NP. With CUDA-NP, we initially enable a high number of threads when a GPU program starts, and use control flow to activate different numbers of threads for different code sections. We implemented our proposed CUDA-NP framework using a directive-based compiler approach. For a GPU kernel, an application developer only needs to add OpenMP-like pragmas for parallelizable code sections. Then, our CUDA-NP compiler automatically generates the optimized GPU kernels. It supports both the reduction and the scan primitives, explores different ways to distribute parallel loop iterations into threads, and efficiently manages on-chip resource. 
Our experiments show that for a set of GPGPU benchmarks, which have already been optimized and contain nested parallelism, our proposed CUDA-NP framework further improves the performance by up to 6.69 times and 2.18 times on average.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "PPoPP '14 conference proceedings.", } @Article{Yang:2014:MPP, author = "Junfeng Yang and Heming Cui and Jingyue Wu and Yang Tang and Gang Hu", title = "Making parallel programs reliable with stable multithreading", journal = j-CACM, volume = "57", number = "3", pages = "58--69", month = mar, year = "2014", CODEN = "CACMA2", DOI = "https://doi.org/10.1145/2500875", ISSN = "0001-0782 (print), 1557-7317 (electronic)", ISSN-L = "0001-0782", bibdate = "Thu Feb 27 17:17:45 MST 2014", bibsource = "http://www.acm.org/pubs/contents/journals/cacm/; https://www.math.utah.edu/pub/tex/bib/cacm2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Stable multithreading dramatically simplifies the interleaving behaviors of parallel programs, offering new hope for making parallel programming easier.", acknowledgement = ack-nhfb, fjournal = "Communications of the ACM", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J79", } @Article{Aliaga:2015:CMS, author = "Jos{\'e} I. Aliaga and Jos{\'e} M. Bad{\'\i}a and Maribel Castillo and Davor Davidovi{\'c} and Rafael Mayo and Enrique S. 
Quintana-Ort{\'\i}", title = "Out-of-core macromolecular simulations on multithreaded architectures", journal = j-CCPE, volume = "27", number = "6", pages = "1540--1550", day = "25", month = apr, year = "2015", CODEN = "CCPEBO", DOI = "https://doi.org/10.1002/cpe.3357", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Sat Jul 25 19:54:07 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/ccpe.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Concurrency and Computation: Practice and Experience", journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626", onlinedate = "31 Aug 2014", } @Article{Aliaga:2015:UPE, author = "Jos{\'e} I. Aliaga and Hartwig Anzt and Maribel Castillo and Juan C. Fern{\'a}ndez and Germ{\'a}n Le{\'o}n and Joaqu{\'\i}n P{\'e}rez and Enrique S. Quintana-Ort{\'\i}", title = "Unveiling the performance-energy trade-off in iterative linear system solvers for multithreaded processors", journal = j-CCPE, volume = "27", number = "4", pages = "885--904", day = "25", month = mar, year = "2015", CODEN = "CCPEBO", DOI = "https://doi.org/10.1002/cpe.3341", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Sat Jul 25 19:54:06 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/ccpe.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Concurrency and Computation: Practice and Experience", journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626", onlinedate = "9 Sep 2014", } @Article{Amer:2015:MRC, author = "Abdelhalim Amer and Huiwei Lu and Yanjie Wei and Pavan Balaji and Satoshi Matsuoka", title = "{MPI+Threads}: runtime contention and remedies", journal = j-SIGPLAN, volume = "50", number = "8", pages = "239--248", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688522", ISSN = "0362-1340 (print), 1523-2867 
(print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Hybrid MPI+Threads programming has emerged as an alternative model to the ``MPI everywhere'' model to better handle the increasing core density in cluster nodes. While the MPI standard allows multithreaded concurrent communication, such flexibility comes with the cost of maintaining thread safety within the MPI implementation, typically implemented using critical sections. In contrast to previous works that studied the importance of critical-section granularity in MPI implementations, in this paper we investigate the implication of critical-section arbitration on communication performance. We first analyze the MPI runtime when multithreaded concurrent communication takes place on hierarchical memory systems. Our results indicate that the mutex-based approach that most MPI implementations use today can incur performance penalties due to unfair arbitration. We then present methods to mitigate these penalties with a first-come, first-served arbitration and a priority locking scheme that favors threads doing useful work. Through evaluations using several benchmarks and applications, we demonstrate up to 5-fold improvement in performance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "PPoPP '15 conference proceedings.", } @Article{Axnix:2015:IZF, author = "C. Axnix and G. Bayer and H. Bohm and J. von Buttlar and M. S. Farrell and L. C. Heller and J. P. Kubala and S. E. Lederer and R. Mansell and A. Nunez Mencias and S. 
Usenbinz", title = "{IBM z13} firmware innovations for simultaneous multithreading and {I/O} virtualization", journal = j-IBM-JRD, volume = "59", number = "??", pages = "11:1--11:11", month = "????", year = "2015", CODEN = "IBMJAE", ISSN = "0018-8646 (print), 2151-8556 (electronic)", ISSN-L = "0018-8646", bibdate = "Wed Oct 21 11:38:12 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/ibmjrd.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", acknowledgement = ack-nhfb, } @Article{Bai:2015:SPA, author = "Xiuxiu Bai and Endong Wang and Xiaoshe Dong and Xingjun Zhang", title = "A scalability prediction approach for multi-threaded applications on manycore processors", journal = j-J-SUPERCOMPUTING, volume = "71", number = "11", pages = "4072--4094", month = nov, year = "2015", CODEN = "JOSUED", DOI = "https://doi.org/10.1007/s11227-015-1505-x", ISSN = "0920-8542 (print), 1573-0484 (electronic)", ISSN-L = "0920-8542", bibdate = "Mon Jan 25 08:18:10 MST 2016", bibsource = "http://link.springer.com/journal/11227/71/11; https://www.math.utah.edu/pub/tex/bib/jsuper.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer.com/article/10.1007/s11227-015-1505-x", acknowledgement = ack-nhfb, fjournal = "The Journal of Supercomputing", journal-URL = "http://link.springer.com/journal/11227", } @Article{Bhatotia:2015:ITL, author = "Pramod Bhatotia and Pedro Fonseca and Umut A. Acar and Bj{\"o}rn B. 
Brandenburg and Rodrigo Rodrigues", title = "{iThreads}: a Threading Library for Parallel Incremental Computation", journal = j-SIGPLAN, volume = "50", number = "4", pages = "645--659", month = apr, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775054.2694371", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Incremental computation strives for efficient successive runs of applications by re-executing only those parts of the computation that are affected by a given input change instead of recomputing everything from scratch. To realize these benefits automatically, we describe iThreads, a threading library for parallel incremental computation. iThreads supports unmodified shared-memory multithreaded programs: it can be used as a replacement for pthreads by a simple exchange of dynamically linked libraries, without even recompiling the application code. To enable such an interface, we designed algorithms and an implementation to operate at the compiled binary code level by leveraging MMU-assisted memory access tracking and process-based thread isolation. 
Our evaluation on a multicore platform using applications from the PARSEC and Phoenix benchmarks and two case-studies shows significant performance gains.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "ASPLOS '15 conference proceedings.", } @Article{Bogdanas:2015:KJC, author = "Denis Bogdanas and Grigore Rosu", title = "{K-Java}: a Complete Semantics of {Java}", journal = j-SIGPLAN, volume = "50", number = "1", pages = "445--456", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2676982", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "This paper presents K-Java, a complete executable formal semantics of Java 1.4. K-Java was extensively tested with a test suite developed alongside the project, following the Test Driven Development methodology. In order to maintain clarity while handling the great size of Java, the semantics was split into two separate definitions --- a static semantics and a dynamic semantics. The output of the static semantics is a preprocessed Java program, which is passed as input to the dynamic semantics for execution. The preprocessed program is a valid Java program, which uses a subset of the features of Java. The semantics is applied to model-check multi-threaded programs. 
Both the test suite and the static semantics are generic and ready to be used in other Java-related projects.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "POPL '15 conference proceedings.", } @Article{Cai:2015:ADB, author = "Yan Cai and Changjiang Jia and Shangru Wu and Ke Zhai and Wing Kwong Chan", title = "{ASN}: A Dynamic Barrier-Based Approach to Confirmation of Deadlocks from Warnings for Large-Scale Multithreaded Programs", journal = j-IEEE-TRANS-PAR-DIST-SYS, volume = "26", number = "1", pages = "13--23", month = jan, year = "2015", CODEN = "ITDSEO", ISSN = "1045-9219 (print), 1558-2183 (electronic)", ISSN-L = "1045-9219", bibdate = "Thu Feb 12 13:58:35 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.computer.org/csdl/trans/td/2015/01/06747310-abs.html", abstract-URL = "http://www.computer.org/csdl/trans/td/2015/01/06747310-abs.html", acknowledgement = ack-nhfb, journal-URL = "http://www.computer.org/tpds/archives.htm", } @Article{Chlipala:2015:NIM, author = "Adam Chlipala", title = "From Network Interface to Multithreaded {Web} Applications: a Case Study in Modular Program Verification", journal = j-SIGPLAN, volume = "50", number = "1", pages = "609--622", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2677003", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many verifications of realistic software systems are monolithic, in the sense that they define single global invariants over complete system state. 
More modular proof techniques promise to support reuse of component proofs and even reduce the effort required to verify one concrete system, just as modularity simplifies standard software development. This paper reports on one case study applying modular proof techniques in the Coq proof assistant. To our knowledge, it is the first modular verification certifying a system that combines infrastructure with an application of interest to end users. We assume a nonblocking API for managing TCP networking streams, and on top of that we work our way up to certifying multithreaded, database-backed Web applications. Key verified components include a cooperative threading library and an implementation of a domain-specific language for XML processing. We have deployed our case-study system on mobile robots, where it interfaces with off-the-shelf components for sensing, actuation, and control.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "POPL '15 conference proceedings.", } @Article{Chlipala:2015:UWS, author = "Adam Chlipala", title = "{Ur\slash Web}: a Simple Model for Programming the {Web}", journal = j-SIGPLAN, volume = "50", number = "1", pages = "153--165", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2677004", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The World Wide Web has evolved gradually from a document delivery platform to an architecture for distributed programming. This largely unplanned evolution is apparent in the set of interconnected languages and protocols that any Web application must manage. 
This paper presents Ur/Web, a domain-specific, statically typed functional programming language with a much simpler model for programming modern Web applications. Ur/Web's model is unified, where programs in a single programming language are compiled to other ``Web standards'' languages as needed; supports novel kinds of encapsulation of Web-specific state; and exposes simple concurrency, where programmers can reason about distributed, multithreaded applications via a mix of transactions and cooperative preemption. We give a tutorial introduction to the main features of Ur/Web and discuss the language implementation and the production Web applications that use it.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "POPL '15 conference proceedings.", } @Article{Curran:2015:IZM, author = "B. W. Curran and C. Jacobi and J. J. Bonanno and D. A. Schroter and K. J. Alexander and A. Puranik and M. M. Helms", title = "The {IBM z13} multithreaded microprocessor", journal = j-IBM-JRD, volume = "59", number = "4--5", pages = "1:1--1:13", month = jul # "\slash " # sep, year = "2015", CODEN = "IBMJAE", ISSN = "0018-8646 (print), 2151-8556 (electronic)", ISSN-L = "0018-8646", bibdate = "Wed Oct 21 11:38:12 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/ibmjrd.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IBM Journal of Research and Development", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5288520", } @Article{Das:2015:SBP, author = "Madan Das and Gabriel Southern and Jose Renau", title = "Section-Based Program Analysis to Reduce Overhead of Detecting Unsynchronized Thread Communication", journal = j-TACO, volume = "12", number = "2", pages = "23:1--23:??", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2766451", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L 
= "1544-3566", bibdate = "Fri Aug 7 09:46:00 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Most systems that test and verify parallel programs, such as deterministic execution engines, data race detectors, and software transactional memory systems, require instrumenting loads and stores in an application. This can cause a very significant runtime and memory overhead compared to executing uninstrumented code. Multithreaded programming typically allows any thread to perform loads and stores to any location in the process's address space independently, and such tools monitor all these memory accesses. However, many of the addresses in these unsynchronized memory accesses are only used by a single thread and do not affect other executing threads. We propose Section-Based Program Analysis (SBPA), a novel way to decompose the program into disjoint code sections to identify and eliminate instrumenting such loads and stores during program compilation so that the program runtime overhead is significantly reduced. Our analysis includes improvements to pointer analysis and uses a few user directives to increase the effectiveness of SBPA further. We implemented SBPA for a deterministic execution runtime environment and were able to eliminate 51\% of dynamic memory access instrumentations. When combined with directives, such reduction increased to 63\%. 
We also integrated SBPA with ThreadSanitizer, a state-of-the-art dynamic race detector, and achieved a speedup of 2.43 (2.74 with directives) on a geometric mean basis.", acknowledgement = ack-nhfb, articleno = "23", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", } @Article{Ding:2015:OCA, author = "Wei Ding and Xulong Tang and Mahmut Kandemir and Yuanrui Zhang and Emre Kultursay", title = "Optimizing off-chip accesses in multicores", journal = j-SIGPLAN, volume = "50", number = "6", pages = "131--142", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737989", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In a network-on-chip (NoC) based manycore architecture, an off-chip data access (main memory access) needs to travel through the on-chip network, spending considerable amount of time within the chip (in addition to the memory access latency). In addition, it contends with on-chip (cache) accesses as both use the same NoC resources. In this paper, focusing on data-parallel, multithreaded applications, we propose a compiler-based off-chip data access localization strategy, which places data elements in the memory space such that an off-chip access traverses a minimum number of links (hops) to reach the memory controller that handles this access. This brings three main benefits. First, the network latency of off-chip accesses gets reduced; second, the network latency of on-chip accesses gets reduced; and finally, the memory latency of off-chip accesses improves, due to reduced queue latencies. 
We present an experimental evaluation of our optimization strategy using a set of 13 multithreaded application programs under both private and shared last-level caches. The results collected emphasize the importance of optimizing the off-chip data accesses.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "PLDI '15 conference proceedings.", } @Article{Fang:2015:MMD, author = "Zhenman Fang and Sanyam Mehta and Pen-Chung Yew and Antonia Zhai and James Greensky and Gautham Beeraka and Binyu Zang", title = "Measuring Microarchitectural Details of Multi- and Many-Core Memory Systems through Microbenchmarking", journal = j-TACO, volume = "11", number = "4", pages = "55:1--55:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2687356", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 12 11:38:56 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "As multicore and many-core architectures evolve, their memory systems are becoming increasingly more complex. To bridge the latency and bandwidth gap between the processor and memory, they often use a mix of multilevel private/shared caches that are either blocking or nonblocking and are connected by high-speed network-on-chip. Moreover, they also incorporate hardware and software prefetching and simultaneous multithreading (SMT) to hide memory latency. On such multi- and many-core systems, to incorporate various memory optimization schemes using compiler optimizations and performance tuning techniques, it is crucial to have microarchitectural details of the target memory system. Unfortunately, such details are often unavailable from vendors, especially for newly released processors. 
In this article, we propose a novel microbenchmarking methodology based on short elapsed-time events (SETEs) to obtain comprehensive memory microarchitectural details in multi- and many-core processors. This approach requires detailed analysis of potential interfering factors that could affect the intended behavior of such memory systems. We lay out effective guidelines to control and mitigate those interfering factors. Taking the impact of SMT into consideration, our proposed methodology not only can measure traditional cache/memory latency and off-chip bandwidth but also can uncover the details of software and hardware prefetching units not attempted in previous studies. Using the newly released Intel Xeon Phi many-core processor (with in-order cores) as an example, we show how we can use a set of microbenchmarks to determine various microarchitectural features of its memory system (many are undocumented from vendors). To demonstrate the portability and validate the correctness of such a methodology, we use the well-documented Intel Sandy Bridge multicore processor (with out-of-order cores) as another example, where most data are available and can be validated. Moreover, to illustrate the usefulness of the measured data, we do a multistage coordinated data prefetching case study on both Xeon Phi and Sandy Bridge and show that by using the measured data, we can achieve 1.3X and 1.08X performance speedup, respectively, compared to the state-of-the-art Intel ICC compiler. 
We believe that these measurements also provide useful insights into memory optimization, analysis, and modeling of such multicore and many-core architectures.", acknowledgement = ack-nhfb, articleno = "55", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", } @Article{Farzan:2015:PSU, author = "Azadeh Farzan and Zachary Kincaid and Andreas Podelski", title = "Proof Spaces for Unbounded Parallelism", journal = j-SIGPLAN, volume = "50", number = "1", pages = "407--420", month = jan, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2775051.2677012", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 12 17:41:19 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In this paper, we present a new approach to automatically verify multi-threaded programs which are executed by an unbounded number of threads running in parallel. The starting point for our work is the problem of how we can leverage existing automated verification technology for sequential programs (abstract interpretation, Craig interpolation, constraint solving, etc.) for multi-threaded programs. Suppose that we are given a correctness proof for a trace of a program (or for some other program fragment). We observe that the proof can always be decomposed into a finite set of Hoare triples, and we ask what can be proved from the finite set of Hoare triples using only simple combinatorial inference rules (without access to a theorem prover and without the possibility to infer genuinely new Hoare triples)? 
We introduce a proof system where one proves the correctness of a multi-threaded program by showing that for each trace of the program, there exists a correctness proof in the space of proofs that are derivable from a finite set of axioms using simple combinatorial inference rules. This proof system is complete with respect to the classical proof method of establishing an inductive invariant (which uses thread quantification and control predicates). Moreover, it is possible to algorithmically check whether a given set of axioms is sufficient to prove the correctness of a multi-threaded program, using ideas from well-structured transition systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "POPL '15 conference proceedings.", } @Article{Ghosh:2015:NCC, author = "Ammlan Ghosh and Rituparna Chaki and Nabendu Chaki", title = "A new concurrency control mechanism for multi-threaded environment using transactional memory", journal = j-J-SUPERCOMPUTING, volume = "71", number = "11", pages = "4095--4115", month = nov, year = "2015", CODEN = "JOSUED", DOI = "https://doi.org/10.1007/s11227-015-1507-8", ISSN = "0920-8542 (print), 1573-0484 (electronic)", ISSN-L = "0920-8542", bibdate = "Mon Jan 25 08:18:10 MST 2016", bibsource = "http://link.springer.com/journal/11227/71/11; https://www.math.utah.edu/pub/tex/bib/jsuper.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer.com/article/10.1007/s11227-015-1507-8; http://link.springer.com/content/pdf/10.1007/s11227-015-1507-8.pdf", acknowledgement = ack-nhfb, fjournal = "The Journal of Supercomputing", journal-URL = "http://link.springer.com/journal/11227", } @Article{Halappanavar:2015:CLL, author = "Mahantesh Halappanavar and Alex Pothen and Ariful Azad and Fredrik Manne and Johannes Langguth and Arif Khan", title = "Codesign Lessons Learned from Implementing Graph Matching on Multithreaded 
Architectures", journal = j-COMPUTER, volume = "48", number = "8", pages = "46--55", month = aug, year = "2015", CODEN = "CPTRB4", DOI = "https://doi.org/10.1109/MC.2015.215", ISSN = "0018-9162 (print), 1558-0814 (electronic)", ISSN-L = "0018-9162", bibdate = "Tue Nov 3 07:04:37 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/computer2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://csdl.computer.org/csdl/mags/co/2015/08/mco2015080046-abs.html", abstract-URL = "http://csdl.computer.org/csdl/mags/co/2015/08/mco2015080046-abs.html", acknowledgement = ack-nhfb, journal-URL = "http://www.computer.org/portal/web/csdl/magazines/computer", } @Article{Hottelier:2015:SLE, author = "Thibaud Hottelier and Rastislav Bodik", title = "Synthesis of layout engines from relational constraints", journal = j-SIGPLAN, volume = "50", number = "10", pages = "74--88", month = oct, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858965.2814291", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:43 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present an algorithm for synthesizing efficient document layout engines from compact relational specifications. These specifications are compact in that a single specification can produce multiple engines, each for a distinct layout situation, i.e., a different combination of known vs. unknown attributes. Technically, our specifications are relational attribute grammars, while our engines are functional attribute grammars. By synthesizing functions from relational constraints, we obviate the need for constraint solving at runtime, because functional attribute grammars can be easily evaluated according to a fixed schedule, sidestepping the backtracking search performed by constraint solvers. 
Our experiments show that we can generate layout engines for non-trivial data visualizations, and that our synthesized engines are between 39- and 200-times faster than general-purpose constraint solvers. Relational specifications of layout give rise to synthesis problems that have previously proved intractable. Our algorithm exploits the hierarchical, grammar-based structure of the specification, decomposing the specification into smaller subproblems, which can be tackled with off-the-shelf synthesis procedures. The new synthesis problem then becomes the composition of the functions thus generated into a correct attribute grammar, which might be recursive. We show how to solve this problem by efficient reduction to an SMT problem.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "OOPSLA '15 conference proceedings.", } @Article{Huang:2015:COM, author = "Kai Huang and Min Yu and Rongjie Yan and Xiaomeng Zhang and Xiaolang Yan and Lisane Brisolara and Ahmed Amine Jerraya and Jiong Feng", title = "Communication Optimizations for Multithreaded Code Generation from {Simulink} Models", journal = j-TECS, volume = "14", number = "3", pages = "59:1--59:??", month = may, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2644811", ISSN = "1539-9087 (print), 1558-3465 (electronic)", ISSN-L = "1539-9087", bibdate = "Sat Dec 9 08:08:56 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/tecs.bib", abstract = "Communication frequency is increasing with the growing complexity of emerging embedded applications and the number of processors in the implemented multiprocessor SoC architectures. In this article, we consider the issue of communication cost reduction during multithreaded code generation from partitioned Simulink models to help designers in code optimization to improve system performance. 
We first propose a technique combining message aggregation and communication pipeline methods, which groups communications with the same destinations and sources and parallelizes communication and computation tasks. We also present a method to apply static analysis and dynamic emulation for efficient communication buffer allocation to further reduce synchronization cost and increase processor utilization. The existing cyclic dependency in the mapped model may hinder the effectiveness of the two techniques. We further propose a set of optimizations involving repartition with strongly connected threads to maximize the degree of communication reduction and preprocessing strategies with available delays in the model to reduce the number of communication channels that cannot be optimized. Experimental results demonstrate the advantages of the proposed optimizations with 11--143\% throughput improvement.", acknowledgement = ack-nhfb, articleno = "59", fjournal = "ACM Transactions on Embedded Computing Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J840", } @Article{Hussein:2015:DRM, author = "Ahmed Hussein and Antony L. Hosking and Mathias Payer and Christopher A. Vick", title = "Don't race the memory bus: taming the {GC} leadfoot", journal = j-SIGPLAN, volume = "50", number = "11", pages = "15--27", month = nov, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2887746.2754182", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:44 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Dynamic voltage and frequency scaling (DVFS) is ubiquitous on mobile devices as a mechanism for saving energy. Reducing the clock frequency of a processor allows a corresponding reduction in power consumption, as does turning off idle cores. 
Garbage collection is a canonical example of the sort of memory-bound workload that best responds to such scaling. Here, we explore the impact of frequency scaling for garbage collection in a real mobile device running Android's Dalvik virtual machine, which uses a concurrent collector. By controlling the frequency of the core on which the concurrent collector thread runs we can reduce power significantly. Running established multi-threaded benchmarks shows that total processor energy can be reduced up to 30\%, with end-to-end performance loss of at most 10\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "ISMM '15 conference proceedings.", } @Article{Jeon:2015:MTH, author = "Yongkweon Jeon and Sungroh Yoon", title = "Multi-Threaded Hierarchical Clustering by Parallel Nearest-Neighbor Chaining", journal = j-IEEE-TRANS-PAR-DIST-SYS, volume = "26", number = "9", pages = "2534--2548", month = sep, year = "2015", CODEN = "ITDSEO", ISSN = "1045-9219 (print), 1558-2183 (electronic)", ISSN-L = "1045-9219", bibdate = "Mon Sep 28 12:20:25 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.computer.org/csdl/trans/td/2015/09/06893001.pdf", abstract-URL = "http://www.computer.org/csdl/trans/td/2015/09/06893001-abs.html", acknowledgement = ack-nhfb, journal-URL = "http://www.computer.org/tpds/archives.htm", } @Article{Kandemir:2015:MRR, author = "Mahmut Kandemir and Hui Zhao and Xulong Tang and Mustafa Karakoy", title = "Memory Row Reuse Distance and its Role in Optimizing Application Performance", journal = j-SIGMETRICS, volume = "43", number = "1", pages = "137--149", month = jun, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2796314.2745867", ISSN = "0163-5999 (print), 1557-9484 (electronic)", ISSN-L = "0163-5999", bibdate = "Fri Sep 18 06:59:51 MDT 2015", 
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigmetrics.bib", abstract = "Continuously increasing dataset sizes of large-scale applications overwhelm on-chip cache capacities and make the performance of last-level caches (LLC) increasingly important. That is, in addition to maximizing LLC hit rates, it is becoming equally important to reduce LLC miss latencies. One of the critical factors that influence LLC miss latencies is row-buffer locality (i.e., the fraction of LLC misses that hit in the large buffer attached to a memory bank). While there has been a plethora of recent works on optimizing row-buffer performance, to our knowledge, there is no study that quantifies the full potential of row-buffer locality and impact of maximizing it on application performance. Focusing on multithreaded applications, the first contribution of this paper is the definition of a new metric called (memory) row reuse distance (RRD). We show that, while intra-core RRDs are relatively small (increasing the chances for row-buffer hits), inter-core RRDs are quite large (increasing the chances for row-buffer misses). Motivated by this, we propose two schemes that measure the maximum potential benefits that could be obtained from minimizing RRDs, to the extent allowed by program dependencies. Specifically, one of our schemes (Scheme-I) targets only intra-core RRDs, whereas the other one (Scheme-II) aims at reducing both intra-core RRDs and inter-core RRDs. 
Our experimental evaluations demonstrate that (i) Scheme-I reduces intra-core RRDs but increases inter-core RRDs; (ii) Scheme-II reduces inter-core RRDs significantly while achieving a similar behavior to Scheme-I as far as intra-core RRDs are concerned; (iii) Scheme-I and Scheme-II improve execution times of our applications by 17\% and 21\%, respectively, on average; and (iv) both our schemes deliver consistently good results under different memory request scheduling policies.", acknowledgement = ack-nhfb, fjournal = "ACM SIGMETRICS Performance Evaluation Review", journal-URL = "http://portal.acm.org/toc.cfm?id=J618", } @Article{Kasikci:2015:ACD, author = "Baris Kasikci and Cristian Zamfir and George Candea", title = "Automated Classification of Data Races Under Both Strong and Weak Memory Models", journal = j-TOPLAS, volume = "37", number = "3", pages = "8:1--8:??", month = jun, year = "2015", CODEN = "ATPSDT", DOI = "https://doi.org/10.1145/2734118", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Fri Jun 19 05:36:55 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/toplas/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/toplas.bib", abstract = "Data races are one of the main causes of concurrency problems in multithreaded programs. Whether all data races are bad, or some are harmful and others are harmless, is still the subject of vigorous scientific debate [Narayanasamy et al. 2007; Boehm 2012]. What is clear, however, is that today's code has many data races [Kasikci et al. 2012; Jin et al. 2012; Erickson et al. 2010], and fixing data races without introducing bugs is time consuming [Godefroid and Nagappan 2008]. Therefore, it is important to efficiently identify data races in code and understand their consequences to prioritize their resolution. 
We present Portend$^+$, a tool that not only detects races but also automatically classifies them based on their potential consequences: Could they lead to crashes or hangs? Could their effects be visible outside the program? Do they appear to be harmless? How do their effects change under weak memory models? Our proposed technique achieves high accuracy by efficiently analyzing multiple paths and multiple thread schedules in combination, and by performing symbolic comparison between program outputs. We ran Portend$^+$ on seven real-world applications: it detected 93 true data races and correctly classified 92 of them, with no human effort. Six of them were harmful races. Portend$^+$ 's classification accuracy is up to 89\% higher than that of existing tools, and it produces easy-to-understand evidence of the consequences of ``harmful'' races, thus both proving their harmfulness and making debugging easier. We envision Portend$^+$ being used for testing and debugging, as well as for automatically triaging bug reports.", acknowledgement = ack-nhfb, articleno = "8", fjournal = "ACM Transactions on Programming Languages and Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783", } @Article{Kerrison:2015:EMS, author = "Steve Kerrison and Kerstin Eder", title = "Energy Modeling of Software for a Hardware Multithreaded Embedded Microprocessor", journal = j-TECS, volume = "14", number = "3", pages = "56:1--56:??", month = may, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2700104", ISSN = "1539-9087 (print), 1558-3465 (electronic)", ISSN-L = "1539-9087", bibdate = "Sat Dec 9 08:08:56 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/tecs.bib", abstract = "This article examines a hardware multithreaded microprocessor and discusses the impact such an architecture has on existing software energy modeling techniques. 
A framework is constructed for analyzing the energy behavior of the XMOS XS1-L multithreaded processor and a variation on existing software energy models is proposed, based on analysis of collected energy data. It is shown that by combining execution statistics with sufficient data on the processor's thread activity and instruction execution costs, a multithreaded software energy model used with Instruction Set Simulation can yield an average error margin of less than 7\%.", acknowledgement = ack-nhfb, articleno = "56", fjournal = "ACM Transactions on Embedded Computing Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J840", } @Article{Kestor:2015:TPD, author = "Gokcen Kestor and Osman S. Unsal and Adrian Cristal and Serdar Tasiran", title = "{TRADE}: Precise Dynamic Race Detection for Scalable Transactional Memory Systems", journal = j-TOPC, volume = "2", number = "2", pages = "11:1--11:??", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2786021", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Fri Aug 7 10:22:35 MDT 2015", bibsource = "http://topc.acm.org/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "As other multithreaded programs, transactional memory (TM) programs are prone to race conditions. Previous work focuses on extending existing definitions of data race for lock-based applications to TM applications, which requires all transactions to be totally ordered ``as if'' serialized by a global lock. This approach poses implementation constraints on the STM that severely limits TM applications' performance. This article shows that forcing total ordering among all running transactions, while sufficient, is not necessary. We introduce an alternative data race definition, relaxed transactional data race, that requires ordering of only conflicting transactions. 
The advantages of our relaxed definition are twofold: First, unlike the previous definition, this definition can be applied to a wide range of TMs, including those that do not enforce transaction total ordering. Second, within a single execution, it exposes a higher number of data races, which considerably reduces debugging time. Based on this definition, we propose a novel and precise race detection tool for C/C++ TM applications (TRADE), which detects data races by tracking happens-before edges among conflicting transactions. Our experiments reveal that TRADE precisely detects data races for STAMP applications running on modern STMs with overhead comparable to state-of-the-art race detectors for lock-based applications. Our experiments also show that in a single run, TRADE identifies several races not discovered by 10 separate runs of a race detection tool based on the previous data race definition.", acknowledgement = ack-nhfb, articleno = "11", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Kocberber:2015:AMA, author = "Onur Kocberber and Babak Falsafi and Boris Grot", title = "Asynchronous memory access chaining", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "4", pages = "252--263", month = dec, year = "2015", CODEN = "????", ISSN = "2150-8097", bibdate = "Sat Dec 19 17:42:25 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "In-memory databases rely on pointer-intensive data structures to quickly locate data in memory. A single lookup operation in such data structures often exhibits long-latency memory stalls due to dependent pointer dereferences. 
Hiding the memory latency by launching additional memory accesses for other lookups is an effective way of improving performance of pointer-chasing codes (e.g., hash table probes, tree traversals). The ability to exploit such inter-lookup parallelism is beyond the reach of modern out-of-order cores due to the limited size of their instruction window. Instead, recent work has proposed software prefetching techniques that exploit inter-lookup parallelism by arranging a set of independent lookups into a group or a pipeline, and navigate their respective pointer chains in a synchronized fashion. While these techniques work well for highly regular access patterns, they break down in the face of irregularity across lookups. Such irregularity includes variable-length pointer chains, early exit, and read/write dependencies. This work introduces Asynchronous Memory Access Chaining (AMAC), a new approach for exploiting inter-lookup parallelism to hide the memory access latency. AMAC achieves high dynamism in dealing with irregularity across lookups by maintaining the state of each lookup separately from that of other lookups. This feature enables AMAC to initiate a new lookup as soon as any of the in-flight lookups complete. In contrast, the static arrangement of lookups into a group or pipeline in existing techniques precludes such adaptivity. Our results show that AMAC matches or outperforms state-of-the-art prefetching techniques on regular access patterns, while delivering up to 2.3x higher performance under irregular data structure lookups. 
AMAC fully utilizes the available microarchitectural resources, generating the maximum number of memory accesses allowed by hardware in both single- and multi-threaded execution modes.", acknowledgement = ack-nhfb, fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Kubica:2015:PHT, author = "Bartlomiej Jacek Kubica", title = "Presentation of a highly tuned multithreaded interval solver for underdetermined and well-determined nonlinear systems", journal = j-NUMER-ALGORITHMS, volume = "70", number = "4", pages = "929--963", month = dec, year = "2015", CODEN = "NUALEG", DOI = "https://doi.org/10.1007/s11075-015-9980-y", ISSN = "1017-1398 (print), 1572-9265 (electronic)", ISSN-L = "1017-1398", bibdate = "Mon Jan 25 08:55:03 MST 2016", bibsource = "http://link.springer.com/journal/11075/70/4; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/numeralgorithms.bib", URL = "http://link.springer.com/article/10.1007/s11075-015-9980-y; http://link.springer.com/content/pdf/10.1007/s11075-015-9980-y.pdf", acknowledgement = ack-nhfb, fjournal = "Numerical Algorithms", journal-URL = "http://link.springer.com/journal/11075", } @Article{Kuszmaul:2015:SSF, author = "Bradley C. Kuszmaul", title = "{SuperMalloc}: a super fast multithreaded {\tt malloc} for 64-bit machines", journal = j-SIGPLAN, volume = "50", number = "11", pages = "41--55", month = nov, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2887746.2754178", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:44 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "SuperMalloc is an implementation of malloc(3) originally designed for X86 Hardware Transactional Memory (HTM)@. 
It turns out that the same design decisions also make it fast even without HTM@. For the malloc-test benchmark, which is one of the most difficult workloads for an allocator, with one thread SuperMalloc is about 2.1 times faster than the best of DLmalloc, JEmalloc, Hoard, and TBBmalloc; with 8 threads and HTM, SuperMalloc is 2.75 times faster; and on 32 threads without HTM SuperMalloc is 3.4 times faster. SuperMalloc generally compares favorably with the other allocators on speed, scalability, speed variance, memory footprint, and code size. SuperMalloc achieves these performance advantages using less than half as much code as the alternatives. SuperMalloc exploits the fact that although physical memory is always precious, virtual address space on a 64-bit machine is relatively cheap. It allocates 2 MiB chunks which contain objects all the same size. To translate chunk numbers to chunk metadata, SuperMalloc uses a simple array (most of which is uncommitted to physical memory). SuperMalloc takes care to avoid associativity conflicts in the cache: most of the size classes are a prime number of cache lines, and nonaligned huge accesses are randomly aligned within a page. Objects are allocated from the fullest non-full page in the appropriate size class. For each size class, SuperMalloc employs a 10-object per-thread cache, a per-CPU cache that holds about a level-2-cache worth of objects per size class, and a global cache that is organized to allow the movement of many objects between a per-CPU cache and the global cache using $ O(1) $ instructions. 
SuperMalloc prefetches everything it can before starting a critical section, which makes the critical sections run fast, and for HTM improves the odds that the transaction will commit.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "ISMM '15 conference proceedings.", } @Article{Lai:2015:SAM, author = "Bo-Cheng Charles Lai and Kun-Chun Li and Guan-Ru Li and Chin-Hsuan Chiang", title = "Self adaptable multithreaded object detection on embedded multicore systems", journal = j-J-PAR-DIST-COMP, volume = "78", number = "??", pages = "25--38", month = apr, year = "2015", CODEN = "JPDCER", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Sat Mar 21 09:26:08 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.sciencedirect.com/science/article/pii/S0743731515000192", acknowledgement = ack-nhfb, fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315/", } @Article{Lal:2015:DID, author = "Akash Lal and Shaz Qadeer", title = "{DAG} inlining: a decision procedure for reachability-modulo-theories in hierarchical programs", journal = j-SIGPLAN, volume = "50", number = "6", pages = "280--290", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737987", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "A hierarchical program is one with multiple procedures but no loops or recursion. 
This paper studies the problem of deciding reachability queries in hierarchical programs where individual statements can be encoded in a decidable logic (say in SMT). This problem is fundamental to verification and most directly applicable to doing bounded reachability in programs, i.e., reachability under a bound on the number of loop iterations and recursive calls. The usual method of deciding reachability in hierarchical programs is to first inline all procedures and then do reachability on the resulting single-procedure program. Such inlining unfolds the call graph of the program to a tree and may lead to an exponential increase in the size of the program. We design and evaluate a method called DAG inlining that unfolds the call graph to a directed acyclic graph (DAG) instead of a tree by sharing the bodies of procedures at certain points during inlining. DAG inlining can produce much more compact representations than tree inlining. Empirically, we show that it leads to significant improvements in the running time of a state-of-the-art verifier.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "PLDI '15 conference proceedings.", } @Article{LaSalle:2015:MTM, author = "Dominique LaSalle and George Karypis", title = "Multi-threaded modularity based graph clustering using the multilevel paradigm", journal = j-J-PAR-DIST-COMP, volume = "76", number = "??", pages = "66--80", month = feb, year = "2015", CODEN = "JPDCER", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Mon Mar 9 10:30:03 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.sciencedirect.com/science/article/pii/S0743731514001750", acknowledgement = ack-nhfb, fjournal = "Journal of Parallel and Distributed Computing", journal-URL = 
"http://www.sciencedirect.com/science/journal/07437315/", } @Article{Lashgar:2015:CSR, author = "Ahmad Lashgar and Ebad Salehi and Amirali Baniasadi", title = "A Case Study in Reverse Engineering {GPGPUs}: Outstanding Memory Handling Resources", journal = j-COMP-ARCH-NEWS, volume = "43", number = "4", pages = "15--21", month = sep, year = "2015", CODEN = "CANED2", DOI = "https://doi.org/10.1145/2927964.2927968", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Apr 22 17:03:53 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "During recent years, GPU micro-architectures have changed dramatically, evolving into powerful many-core deep-multithreaded platforms for parallel workloads. While important micro-architectural modifications continue to appear in every new generation of these processors, unfortunately, little is known about the details of these innovative designs. One of the key questions in understanding GPUs is how they deal with outstanding memory misses. Our goal in this study is to find answers to this question. To this end, we develop a set of micro-benchmarks in CUDA to understand the outstanding memory requests handling resources. Particularly, we study two NVIDIA GPGPUs (Fermi and Kepler) and estimate their capability in handling outstanding memory requests. We show that Kepler can issue nearly 32X higher number of outstanding memory requests, compared to Fermi. 
We explain this enhancement by Kepler's architectural modifications in outstanding memory request handling resources.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", remark = "HEART '15 conference proceedings.", } @Article{Liu:2015:LRT, author = "Peng Liu and Xiangyu Zhang and Omer Tripp and Yunhui Zheng", title = "{Light}: replay via tightly bounded recording", journal = j-SIGPLAN, volume = "50", number = "6", pages = "55--64", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2738001", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Reproducing concurrency bugs is a prominent challenge. Existing techniques either rely on recording very fine grained execution information and hence have high runtime overhead, or strive to log as little information as possible but provide no guarantee in reproducing a bug. We present Light, a technique that features much lower overhead compared to techniques based on fine grained recording, and that guarantees to reproduce concurrent bugs. We leverage and formally prove that recording flow dependences is the necessary and sufficient condition to reproduce a concurrent bug. The flow dependences, together with the thread local orders that can be automatically inferred (and hence not logged), are encoded as scheduling constraints. An SMT solver is used to derive a replay schedule, which is guaranteed to exist even though it may be different from the original schedule. Our experiments show that Light has only 44\% logging overhead, almost one order of magnitude lower than the state of the art techniques relying on logging memory accesses. 
Its space overhead is only 10\% of those techniques. Light can also reproduce all the bugs we have collected whereas existing techniques miss some of them.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "PLDI '15 conference proceedings.", } @Article{Machado:2015:CDD, author = "Nuno Machado and Brandon Lucia and Lu{\'\i}s Rodrigues", title = "Concurrency debugging with differential schedule projections", journal = j-SIGPLAN, volume = "50", number = "6", pages = "586--595", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737973", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present Symbiosis: a concurrency debugging technique based on novel differential schedule projections (DSPs). A DSP shows the small set of memory operations and data-flows responsible for a failure, as well as a reordering of those elements that avoids the failure. To build a DSP, Symbiosis first generates a full, failing, multithreaded schedule via thread path profiling and symbolic constraint solving. Symbiosis selectively reorders events in the failing schedule to produce a non-failing, alternate schedule. A DSP reports the ordering and data-flow differences between the failing and non-failing schedules. Our evaluation on buggy real-world software and benchmarks shows that, in practical time, Symbiosis generates DSPs that both isolate the small fraction of event orders and data-flows responsible for the failure, and show which event reorderings prevent failing. In our experiments, DSPs contain 81\% fewer events and 96\% less data-flows than the full failure-inducing schedules. 
Moreover, by allowing developers to focus on only a few events, DSPs reduce the amount of time required to find a valid fix.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "PLDI '15 conference proceedings.", } @Article{Makreshanski:2015:LSE, author = "Darko Makreshanski and Justin Levandoski and Ryan Stutsman", title = "To lock, swap, or elide: on the interplay of hardware transactional memory and lock-free indexing", journal = j-PROC-VLDB-ENDOWMENT, volume = "8", number = "11", pages = "1298--1309", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.14778/2809974.2809990", ISSN = "2150-8097", bibdate = "Thu Jul 30 16:13:08 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The release of hardware transactional memory (HTM) in commodity CPUs has major implications on the design and implementation of main-memory databases, especially on the architecture of high-performance lock-free indexing methods at the core of several of these systems. This paper studies the interplay of HTM and lock-free indexing methods. First, we evaluate whether HTM will obviate the need for crafty lock-free index designs by integrating it in a traditional B-tree architecture. HTM performs well for simple data sets with small fixed-length keys and payloads, but its benefits disappear for more complex scenarios (e.g., larger variable-length keys and payloads), making it unattractive as a general solution for achieving high performance. Second, we explore fundamental differences between HTM-based and lock-free B-tree designs. While lock-freedom entails design complexity and extra mechanism, it has performance advantages in several scenarios, especially high-contention cases where readers proceed uncontested (whereas HTM aborts readers). 
Finally, we explore the use of HTM as a method to simplify lock-free design. We find that using HTM to implement a multi-word compare-and-swap greatly reduces lock-free programming complexity at the cost of only a 10-15\% performance degradation. Our study uses two state-of-the-art index implementations: a memory-optimized B-tree extended with HTM to provide multi-threaded concurrency and the Bw-tree lock-free B-tree used in several Microsoft production environments.", acknowledgement = ack-nhfb, fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Markovic:2015:TLS, author = "Nikola Markovic and Daniel Nemirovsky and Osman Unsal and Mateo Valero and Adrian Cristal", title = "Thread Lock Section-Aware Scheduling on Asymmetric Single-{ISA} Multi-Core", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "2", pages = "160--163", month = jul # "\slash " # dec, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2357805", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Jun 20 17:18:18 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "As thread level parallelism in applications has continued to expand, so has research in chip multi-core processors. As more and more applications become multi-threaded we expect to find a growing number of threads executing on a machine. As a consequence, the operating system will require increasingly larger amounts of CPU time to schedule these threads efficiently. Instead of perpetuating the trend of performing more complex thread scheduling in the operating system, we propose a scheduling mechanism that can be efficiently implemented in hardware as well. 
Our approach of identifying multi-threaded application bottlenecks such as thread synchronization sections complements the Fairness-aware Scheduler method. It achieves an average speed up of 11.5 percent (geometric mean) compared to the state-of-the-art Fairness-aware Scheduler.", acknowledgement = ack-nhfb, affiliation = "Markovic, N (Reprint Author), Barcelona Supercomputing Ctr, Barcelona, Spain. Markovic, Nikola; Nemirovsky, Daniel; Unsal, Osman; Valero, Mateo, Barcelona Supercomputing Ctr, Barcelona, Spain. Markovic, Nikola; Nemirovsky, Daniel; Valero, Mateo, Univ Politecn Cataluna, Barcelona, Spain. Cristal, Adrian, Univ Politecn Cataluna, Barcelona Supercomputing Ctr, E-08028 Barcelona, Spain. Cristal, Adrian, Artificial Intelligence Res Inst Spanish Natl Res, Barcelona, Spain.", author-email = "nikola.markovic@bsc.es daniel.nemirovsky@bsc.es osman.unsal@bsc.es mateo.valero@bsc.es adrian.cristal@bsc.es", da = "2019-06-20", doc-delivery-number = "CZ7DC", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Asymmetric chip multiprocessor (ACMP); HW/SW thread scheduling; multi-threaded applications", number-of-cited-references = "17", ORCID-numbers = "UNSAL, OSMAN/0000-0002-0544-9697 Valero, Mateo/0000-0003-2917-2482", research-areas = "Computer Science", researcherid-numbers = "UNSAL, OSMAN/B-9161-2016 Valero, Mateo/L-5709-2014", times-cited = "7", unique-id = "Markovic:2015:TLS", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Matheou:2015:ASD, author = "George Matheou and Paraskevas Evripidou", title = "Architectural Support for Data-Driven Execution", journal = j-TACO, volume = "11", number = "4", pages = "52:1--52:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2686874", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 12 11:38:56 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The exponential growth of sequential processors has come to an end, and thus, parallel processing is probably the only way to achieve performance growth. We propose the development of parallel architectures based on data-driven scheduling. Data-driven scheduling enforces only a partial ordering as dictated by the true data dependencies, which is the minimum synchronization possible. This is very beneficial for parallel processing because it enables it to exploit the maximum possible parallelism. We provide architectural support for data-driven execution for the Data-Driven Multithreading (DDM) model. In the past, DDM has been evaluated mostly in the form of virtual machines. The main contribution of this work is the development of a highly efficient hardware support for data-driven execution and its integration into a multicore system with eight cores on a Virtex-6 FPGA. 
The DDM semantics make barriers and cache coherence unnecessary, which reduces the synchronization latencies significantly and makes the cache simpler. The performance evaluation has shown that the support for data-driven execution is very efficient with negligible overheads. Our prototype can support very small problem sizes (matrix $ 16 \times 16$) and ultra-lightweight threads (block of $ 4 \times 4$) that achieve speedups close to linear. Such results cannot be achieved by software-based systems.", acknowledgement = ack-nhfb, articleno = "52", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", } @Article{McCartney:2015:SMT, author = "W. P. McCartney and N. Sridhar", title = "Stackless Multi-Threading for Embedded Systems", journal = j-IEEE-TRANS-COMPUT, volume = "64", number = "10", pages = "2940--2952", month = "????", year = "2015", CODEN = "ITCOB4", DOI = "https://doi.org/10.1109/TC.2014.2378256", ISSN = "0018-9340 (print), 1557-9956 (electronic)", ISSN-L = "0018-9340", bibdate = "Tue Oct 13 06:51:51 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Computers", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12", } @Article{Mehta:2015:MTP, author = "Kshitij Mehta and Edgar Gabriel", title = "Multi-Threaded Parallel {I/O} for {OpenMP} Applications", journal = j-INT-J-PARALLEL-PROG, volume = "43", number = "2", pages = "286--309", month = apr, year = "2015", CODEN = "IJPPE5", DOI = "https://doi.org/10.1007/s10766-014-0306-9", ISSN = "0885-7458 (print), 1573-7640 (electronic)", ISSN-L = "0885-7458", bibdate = "Sat Aug 8 12:34:16 MDT 2015", bibsource = "http://link.springer.com/journal/10766/43/2; https://www.math.utah.edu/pub/tex/bib/intjparallelprogram.bib; 
https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer.com/article/10.1007/s10766-014-0306-9", acknowledgement = ack-nhfb, fjournal = "International Journal of Parallel Programming", journal-URL = "http://link.springer.com/journal/10766", } @Article{Mitchell:2015:GIA, author = "Nathan Mitchell and Court Cutting and Eftychios Sifakis", title = "{GRIDiron}: an interactive authoring and cognitive training foundation for reconstructive plastic surgery procedures", journal = j-TOG, volume = "34", number = "4", pages = "43:1--43:??", month = aug, year = "2015", CODEN = "ATGRDF", DOI = "https://doi.org/10.1145/2766918", ISSN = "0730-0301 (print), 1557-7368 (electronic)", ISSN-L = "0730-0301", bibdate = "Tue Jul 28 17:22:44 MDT 2015", bibsource = "http://www.acm.org/pubs/contents/journals/tog/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/tog.bib", abstract = "We present an interactive simulation framework for authoring surgical procedures of soft tissue manipulation using physics-based simulation to animate the flesh. This interactive authoring tool can be used by clinical educators to craft three-dimensional illustrations of the intricate maneuvers involved in craniofacial repairs, in contrast to two-dimensional sketches and still photographs which are the medium used to describe these procedures in the traditional surgical curriculum. Our virtual environment also allows surgeons-in-training to develop cognitive skills for craniofacial surgery by experimenting with different approaches to reconstructive challenges, adapting stock techniques to flesh regions with nonstandard shape, and reach preliminary predictions about the feasibility of a given repair plan. We use a Cartesian grid-based embedded discretization of nonlinear elasticity to maximize regularity, and expose opportunities for aggressive multithreading and SIMD accelerations. 
Using a grid-based approach facilitates performance and scalability, but constrains our ability to capture the topology of thin surgical incisions. We circumvent this restriction by hybridizing the grid-based discretization with an explicit hexahedral mesh representation in regions where the embedding mesh necessitates overlap or nonmanifold connectivity. Finally, we detail how the front-end of our system can run on lightweight clients, while the core simulation capability can be hosted on a dedicated server and delivered as a network service.", acknowledgement = ack-nhfb, articleno = "43", fjournal = "ACM Transactions on Graphics", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J778", } @Article{Nelson:2015:RGH, author = "Thomas Nelson and Geoffrey Belter and Jeremy G. Siek and Elizabeth Jessup and Boyana Norris", title = "Reliable Generation of High-Performance Matrix Algebra", journal = j-TOMS, volume = "41", number = "3", pages = "18:1--18:27", month = jun, year = "2015", CODEN = "ACMSCU", DOI = "https://doi.org/10.1145/2629698", ISSN = "0098-3500 (print), 1557-7295 (electronic)", ISSN-L = "0098-3500", bibdate = "Wed Jun 3 17:59:32 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/toms.bib", abstract = "Scientific programmers often turn to vendor-tuned Basic Linear Algebra Subprograms (BLAS) to obtain portable high performance. However, many numerical algorithms require several BLAS calls in sequence, and those successive calls do not achieve optimal performance. The entire sequence needs to be optimized in concert. Instead of vendor-tuned BLAS, a programmer could start with source code in Fortran or C (e.g., based on the Netlib BLAS) and use a state-of-the-art optimizing compiler. However, our experiments show that optimizing compilers often attain only one-quarter of the performance of hand-optimized code. 
In this article, we present a domain-specific compiler for matrix kernels, the Build to Order BLAS (BTO), that reliably achieves high performance using a scalable search algorithm for choosing the best combination of loop fusion, array contraction, and multithreading for data parallelism. The BTO compiler generates code that is between 16\% slower and 39\% faster than hand-optimized code.", acknowledgement = ack-nhfb, articleno = "18", fjournal = "ACM Transactions on Mathematical Software (TOMS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J782", } @Article{Nguyen:2015:RCC, author = "Ph{\'u}c C. Nguy{\~{\^e}}n and David {Van Horn}", title = "Relatively complete counterexamples for higher-order programs", journal = j-SIGPLAN, volume = "50", number = "6", pages = "446--456", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737971", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "In this paper, we study the problem of generating inputs to a higher-order program causing it to error. We first approach the problem in the setting of PCF, a typed, core functional language and contribute the first relatively complete method for constructing counterexamples for PCF programs. The method is relatively complete with respect to a first-order solver over the base types of PCF. In practice, this means an SMT solver can be used for the effective, automated generation of higher-order counterexamples for a large class of programs. We achieve this result by employing a novel form of symbolic execution for higher-order programs. The remarkable aspect of this symbolic execution is that even though symbolic higher-order inputs and values are considered, the path condition remains a first-order formula.
Our handling of symbolic function application enables the reconstruction of higher-order counterexamples from this first-order formula. After establishing our main theoretical results, we sketch how to apply the approach to untyped, higher-order, stateful languages with first-class contracts and show how counterexample generation can be used to detect contract violations in this setting. To validate our approach, we implement a tool generating counterexamples for erroneous modules written in Racket.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "PLDI '15 conference proceedings.", } @Article{Pager:2015:SSM, author = "Jared Pager and Reiley Jeyapaul and Aviral Shrivastava", title = "A Software Scheme for Multithreading on {CGRAs}", journal = j-TECS, volume = "14", number = "1", pages = "19:1--19:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2638558", ISSN = "1539-9087 (print), 1558-3465 (electronic)", ISSN-L = "1539-9087", bibdate = "Thu Jan 22 06:25:23 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/tecs.bib", abstract = "Recent industry trends show a drastic rise in the use of hand-held embedded devices, from everyday applications to medical (e.g., monitoring devices) and critical defense applications (e.g., sensor nodes). The two key requirements in the design of such devices are their processing capabilities and battery life. There is therefore an urgency to build high-performance and power-efficient embedded devices, inspiring researchers to develop novel system designs for the same. The use of a coprocessor (application-specific hardware) to offload power-hungry computations is gaining favor among system designers to suit their power budgets. We propose the use of CGRAs (Coarse-Grained Reconfigurable Arrays) as a power-efficient coprocessor. 
Though CGRAs have been widely used for streaming applications, the extensive compiler support required limits its applicability and use as a general purpose coprocessor. In addition, a CGRA structure can efficiently execute only one statically scheduled kernel at a time, which is a serious limitation when used as an accelerator to a multithreaded or multitasking processor. In this work, we envision a multithreaded CGRA where multiple schedules (or kernels) can be executed simultaneously on the CGRA (as a coprocessor). We propose a comprehensive software scheme that transforms the traditionally single-threaded CGRA into a multithreaded coprocessor to be used as a power-efficient accelerator for multithreaded embedded processors. Our software scheme includes (1) a compiler framework that integrates with existing CGRA mapping techniques to prepare kernels for execution on the multithreaded CGRA and (2) a runtime mechanism that dynamically schedules multiple kernels (offloaded from the processor) to execute simultaneously on the CGRA coprocessor. Our multithreaded CGRA coprocessor implementation thus makes it possible to achieve improved power-efficient computing in modern multithreaded embedded systems.", acknowledgement = ack-nhfb, articleno = "19", fjournal = "ACM Transactions on Embedded Computing Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J840", } @Article{Perez:2015:ECR, author = "J. F. P{\'e}rez and G. Casale and S. 
Pacheco-Sanchez", title = "Estimating Computational Requirements in Multi-Threaded Applications", journal = j-IEEE-TRANS-SOFTW-ENG, volume = "41", number = "3", pages = "264--278", month = mar, year = "2015", CODEN = "IESEDJ", DOI = "https://doi.org/10.1109/TSE.2014.2363472", ISSN = "0098-5589 (print), 1939-3520 (electronic)", ISSN-L = "0098-5589", bibdate = "Thu Feb 1 19:49:24 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranssoftweng2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=6926798", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Software Engineering", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=32", } @Article{Porter:2015:MMS, author = "Leo Porter and Michael A. Laurenzano and Ananta Tiwari and Adam Jundt and William A. {Ward, Jr.} and Roy Campbell and Laura Carrington", title = "Making the Most of {SMT} in {HPC}: System- and Application-Level Perspectives", journal = j-TACO, volume = "11", number = "4", pages = "59:1--59:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2687651", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 12 11:38:56 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "This work presents an end-to-end methodology for quantifying the performance and power benefits of simultaneous multithreading (SMT) for HPC centers and applies this methodology to a production system and workload. Ultimately, SMT's value system-wide depends on whether users effectively employ SMT at the application level. However, predicting SMT's benefit for HPC applications is challenging; by doubling the number of threads, the application's characteristics may change. 
This work proposes statistical modeling techniques to predict the speedup SMT confers to HPC applications. This approach, accurate to within 8\%, uses only lightweight, transparent performance monitors collected during a single run of the application.", acknowledgement = ack-nhfb, articleno = "59", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", } @Article{Porter:2015:PFG, author = "Donald E. Porter and Michael D. Bond and Indrajit Roy and Kathryn S. McKinley and Emmett Witchel", title = "Practical Fine-Grained Information Flow Control Using {Laminar}", journal = j-TOPLAS, volume = "37", number = "1", pages = "4:1--4:??", month = jan, year = "2015", CODEN = "ATPSDT", DOI = "https://doi.org/10.1145/2638548", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Wed Jan 21 07:13:17 MST 2015", bibsource = "http://www.acm.org/pubs/contents/journals/toplas/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/toplas.bib", abstract = "Decentralized Information Flow Control (DIFC) is a promising model for writing programs with powerful, end-to-end security guarantees. Current DIFC systems that run on commodity hardware can be broadly categorized into two types: language-level and operating system-level DIFC. Language solutions provide no guarantees against security violations on system resources such as files and sockets. Operating system solutions mediate accesses to system resources but are either inefficient or imprecise at monitoring the flow of information through fine-grained program data structures. This article describes Laminar, the first system to implement DIFC using a unified set of abstractions for OS resources and heap-allocated objects. Programmers express security policies by labeling data with secrecy and integrity labels and access the labeled data in security methods.
Laminar enforces the security policies specified by the labels at runtime. Laminar is implemented using a modified Java virtual machine and a new Linux security module. This article shows that security methods ease incremental deployment and limit dynamic security checks by retrofitting DIFC policies on four application case studies. Replacing the applications' ad hoc security policies changes less than 10\% of the code and incurs performance overheads from 5\% to 56\%. Compared to prior DIFC systems, Laminar supports a more general class of multithreaded DIFC programs efficiently and integrates language and OS abstractions.", acknowledgement = ack-nhfb, articleno = "4", fjournal = "ACM Transactions on Programming Languages and Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783", } @Article{Rodrigues:2015:DSE, author = "Rance Rodrigues and Israel Koren and Sandip Kundu", title = "Does the Sharing of Execution Units Improve Performance\slash Power of Multicores?", journal = j-TECS, volume = "14", number = "1", pages = "17:1--17:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2680543", ISSN = "1539-9087 (print), 1558-3465 (electronic)", ISSN-L = "1539-9087", bibdate = "Thu Jan 22 06:25:23 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/tecs.bib", abstract = "Several studies and recent real-world designs have promoted sharing of underutilized resources between cores in a multicore processor to achieve better performance/power. It has been argued that when utilization of such resources is low, sharing has a negligible impact on performance while offering considerable area and power benefits. In this article, we investigate the performance and performance/watt implications of sharing large and underutilized resources between pairs of cores in a multicore. 
We first study sharing of the entire floating-point datapath (including reservation stations and execution units) by two cores, similar to AMD's Bulldozer. We find that while this architecture results in power savings for certain workload combinations, it also results in significant performance loss of up to 28\%. Next, we study an alternative sharing architecture where only the floating-point execution units are shared, while the individual cores retain their reservation stations. This reduces the highest performance loss to 14\%. We then extend the study to include sharing of other large execution units that are used infrequently, namely, the integer multiply and divide units. Subsequently, we analyze the impact of sharing hardware resources in Simultaneously Multithreaded (SMT) processors where multiple threads run concurrently on the same core. We observe that sharing improves performance/watt at a negligible performance cost only if the shared units have high throughput. Sharing low-throughput units reduces both performance and performance/watt. To increase the throughput of the shared units, we propose the use of Dynamic Voltage and Frequency Boosting (DVFB) of only the shared units that can be placed on a separate voltage island. 
Our results indicate that the use of DVFB improves both performance and performance/watt by as much as 22\% and 10\%, respectively.", acknowledgement = ack-nhfb, articleno = "17", fjournal = "ACM Transactions on Embedded Computing Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J840", } @Article{Saillard:2015:SDV, author = "Emmanuelle Saillard and Patrick Carribault and Denis Barthou", title = "Static\slash dynamic validation of {MPI} collective communications in multi-threaded context", journal = j-SIGPLAN, volume = "50", number = "8", pages = "279--280", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688548", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Scientific applications mainly rely on the MPI parallel programming model to reach high performance on supercomputers. The advent of manycore architectures (larger number of cores and lower amount of memory per core) leads to mix MPI with a thread-based model like OpenMP. But integrating two different programming models inside the same application can be tricky and generate complex bugs. Thus, the correctness of hybrid programs requires a special care regarding MPI calls location. For example, identical MPI collective operations cannot be performed by multiple non-synchronized threads. To tackle this issue, this paper proposes a static analysis and a reduced dynamic instrumentation to detect bugs related to misuse of MPI collective operations inside or outside threaded regions. This work extends PARCOACH designed for MPI-only applications and keeps the compatibility with these algorithms. 
We validated our method on multiple hybrid benchmarks and applications with a low overhead.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "PPoPP '15 conference proceedings.", } @Article{Samak:2015:SRT, author = "Malavika Samak and Murali Krishna Ramanathan and Suresh Jagannathan", title = "Synthesizing racy tests", journal = j-SIGPLAN, volume = "50", number = "6", pages = "175--185", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737998", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Subtle concurrency errors in multithreaded libraries that arise because of incorrect or inadequate synchronization are often difficult to pinpoint precisely using only static techniques. On the other hand, the effectiveness of dynamic race detectors is critically dependent on multithreaded test suites whose execution can be used to identify and trigger races. Usually, such multithreaded tests need to invoke a specific combination of methods with objects involved in the invocations being shared appropriately to expose a race. Without a priori knowledge of the race, construction of such tests can be challenging. In this paper, we present a lightweight and scalable technique for synthesizing precisely these kinds of tests. Given a multithreaded library and a sequential test suite, we describe a fully automated analysis that examines sequential execution traces, and produces as its output a concurrent client program that drives shared objects via library method calls to states conducive for triggering a race. 
Experimental results on a variety of well-tested Java libraries yield 101 synthesized multithreaded tests in less than four minutes. Analyzing the execution of these tests using an off-the-shelf race detector reveals 187 harmful races, including several previously unreported ones. Our implementation, named NARADA, and the results of our experiments are available at http://www.csa.iisc.ernet.in/~sss/tools/narada.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "PLDI '15 conference proceedings.", } @Article{Saxena:2015:MAR, author = "Sanjay Saxena and Neeraj Sharma and Shiru Sharma", title = "Multithreaded Approach for Registration of Medical Images using Mutual Information in Multicore Environment and its Applications in Medical Imaging", journal = j-INT-J-COMP-APPL, volume = "113", number = "3", pages = "23--32", month = mar, year = "2015", CODEN = "????", DOI = "https://doi.org/10.5120/19807-1598", ISSN = "0975-8887", ISSN-L = "0975-8887", bibdate = "Fri Jan 24 09:16:05 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/intjcompappl.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "https://www.ijcaonline.org/archives/volume113/number3/19807-1598/", acknowledgement = ack-nhfb, ajournal = "Intern. J. of Computer Applications", articleno = "3", fjournal = "International Journal of Computer Applications", journal-URL = "https://www.ijcaonline.org/", } @Article{Schweitzer:2015:PEM, author = "P. Schweitzer and S. Cipi{\`e}re and A. Dufaure and H. Payno and Y. Perrot and D. R. C. Hill and L. 
Maigne", title = "Performance Evaluation of Multithreaded {Geant4} Simulations Using an {Intel Xeon Phi} Cluster", journal = j-SCI-PROG, volume = "2015", number = "??", pages = "980752:1--980752:10", month = "????", year = "2015", CODEN = "SCIPEV", DOI = "https://doi.org/10.1155/2015/980752", ISSN = "1058-9244 (print), 1875-919X (electronic)", ISSN-L = "1058-9244", bibdate = "Tue Sep 20 07:53:44 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sciprogram.bib", URL = "https://www.hindawi.com/journals/sp/2015/980752/", acknowledgement = ack-nhfb, fjournal = "Scientific Programming", journal-URL = "https://www.hindawi.com/journals/sp/", journalabr = "Sci. Prog", } @Article{Shi:2015:CLM, author = "Qingchuan Shi and Henry Hoffmann and Omer Khan", title = "A Cross-Layer Multicore Architecture to Tradeoff Program Accuracy and Resilience Overheads", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "2", pages = "85--89", month = jul # "\slash " # dec, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2365204", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Jun 20 17:18:18 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "To protect multicores from soft-error perturbations, resiliency schemes have been developed with high coverage but high power/performance overheads (similar to 2x). We observe that not all soft-errors affect program correctness, some soft-errors only affect program accuracy, i.e., the program completes with certain acceptable deviations from soft-error free outcome. Thus, it is practical to improve processor efficiency by trading off resilience overheads with program accuracy. 
We propose the idea of declarative resilience that selectively applies resilience schemes to both crucial and non-crucial code, while ensuring program correctness. At the application level, crucial and non-crucial code is identified based on its impact on the program outcome. The hardware collaborates with software support to enable efficient resilience with 100 percent soft-error coverage. Only program accuracy is compromised in the worst-case scenario of a soft-error strike during non-crucial code execution. For a set of multithreaded benchmarks, declarative resilience improves completion time by an average of 21 percent over state-of-the-art hardware resilience scheme that protects all executed code. Its performance overhead is similar to 1.38x over a multicore that does not support resilience.", acknowledgement = ack-nhfb, affiliation = "Shi, QC (Reprint Author), Univ Connecticut, Dept Elect \& Comp Engn, Storrs, CT 06269 USA. Shi, Qingchuan; Khan, Omer, Univ Connecticut, Dept Elect \& Comp Engn, Storrs, CT 06269 USA. Hoffmann, Henry, Univ Chicago, Dept Comp Sci, Chicago, IL 60637 USA.", author-email = "qingchuan.shi@uconn.edu hankhoffmann@cs.uchicago.edu khan@uconn.edu", da = "2019-06-20", doc-delivery-number = "CZ7DC", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "multicores; program accuracy; Resilience; soft-errors", number-of-cited-references = "23", research-areas = "Computer Science", times-cited = "4", unique-id = "Shi:2015:CLM", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Won:2015:MMC, author = "Youjip Won and Kyeongyeol Lim and Jaehong Min", title = "{MUCH}: Multithreaded Content-Based File Chunking", journal = j-IEEE-TRANS-COMPUT, volume = "64", number = "5", pages = "1375--1388", month = may, year = "2015", CODEN = "ITCOB4", DOI = "https://doi.org/10.1109/TC.2014.2322600", ISSN = "0018-9340 (print), 1557-9956 (electronic)", ISSN-L = "0018-9340", bibdate = "Thu Jun 4 19:46:44 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Computers", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12", } @Article{Zhang:2015:DMB, author = "Junchao Zhang and Babak Behzad and Marc Snir", title = "Design of a Multithreaded {Barnes--Hut} Algorithm for Multicore Clusters", journal = j-IEEE-TRANS-PAR-DIST-SYS, volume = "26", number = "7", pages = "1861--1873", month = jul, year = "2015", CODEN = "ITDSEO", DOI = "https://doi.org/10.1109/TPDS.2014.2331243", ISSN = "1045-9219 (print), 1558-2183 (electronic)", ISSN-L = "1045-9219", bibdate = "Mon Aug 3 11:58:51 MDT 2015", bibsource = "https://www.math.utah.edu/pub/bibnet/subjects/fastmultipole.bib; https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.computer.org/csdl/trans/td/2015/07/06837521-abs.html", abstract-URL = "http://www.computer.org/csdl/trans/td/2015/07/06837521-abs.html", acknowledgement = ack-nhfb, journal-URL = "http://www.computer.org/tpds/archives.htm", } 
@Article{Zhang:2015:DPO, author = "Naling Zhang and Markus Kusano and Chao Wang", title = "Dynamic partial order reduction for relaxed memory models", journal = j-SIGPLAN, volume = "50", number = "6", pages = "250--259", month = jun, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2813885.2737956", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:41 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Under a relaxed memory model such as TSO or PSO, a concurrent program running on a shared-memory multiprocessor may observe two types of nondeterminism: the nondeterminism in thread scheduling and the nondeterminism in store buffering. Although there is a large body of work on mitigating the scheduling nondeterminism during runtime verification, methods for soundly mitigating the store buffering nondeterminism are lacking. We propose a new dynamic partial order reduction (POR) algorithm for verifying concurrent programs under TSO and PSO. Our method relies on modeling both types of nondeterminism in a unified framework, which allows us to extend existing POR techniques to TSO and PSO without overhauling the verification algorithm. In addition to sound POR, we also propose a buffer-bounding method for more aggressively reducing the state space. We have implemented our new methods in a stateless model checking tool and demonstrated their effectiveness on a set of multithreaded C benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "PLDI '15 conference proceedings.", } @Article{Zhang:2015:LOS, author = "Minjia Zhang and Jipeng Huang and Man Cao and Michael D. 
Bond", title = "Low-overhead software transactional memory with progress guarantees and strong semantics", journal = j-SIGPLAN, volume = "50", number = "8", pages = "97--108", month = aug, year = "2015", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/2858788.2688510", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Feb 16 12:01:42 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Software transactional memory offers an appealing alternative to locks by improving programmability, reliability, and scalability. However, existing STMs are impractical because they add high instrumentation costs and often provide weak progress guarantees and/or semantics. This paper introduces a novel STM called LarkTM that provides three significant features. (1) Its instrumentation adds low overhead except when accesses actually conflict, enabling low single-thread overhead and scaling well on low-contention workloads. (2) It uses eager concurrency control mechanisms, yet naturally supports flexible conflict resolution, enabling strong progress guarantees. (3) It naturally provides strong atomicity semantics at low cost. LarkTM's design works well for low-contention workloads, but adds significant overhead under higher contention, so we design an adaptive version of LarkTM that uses alternative concurrency control for high-contention objects. 
An implementation and evaluation in a Java virtual machine show that the basic and adaptive versions of LarkTM not only provide low single-thread overhead, but their multithreaded performance compares favorably with existing high-performance STMs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "PPoPP '15 conference proceedings.", } @Article{Zheng:2015:ACC, author = "Zhong Zheng and Zhiying Wang and Mikko Lipasti", title = "Adaptive Cache and Concurrency Allocation on {GPGPUs}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "14", number = "2", pages = "90--93", month = jul # "\slash " # dec, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2014.2359882", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Memory bandwidth is critical to GPGPU performance. Exploiting locality in caches can better utilize memory bandwidth. However, memory requests issued by excessive threads cause cache thrashing and saturate memory bandwidth, degrading performance. In this paper, we propose adaptive cache and concurrency allocation (CCA) to prevent cache thrashing and improve the utilization of bandwidth and computational resources, hence improving performance. According to locality and reuse distance of access patterns in GPGPU program, warps on a stream multiprocessor are dynamically divided into three groups: cached, bypassed, and waiting. The data cache accommodates the footprint of cached warps. Bypassed warps cannot allocate cache lines in the data cache to prevent cache thrashing, but are able to take advantage of available memory bandwidth and computational resource. Waiting warps are de-scheduled. 
Experimental results show that adaptive CCA can significant improve benchmark performance, with 80 percent harmonic mean IPC improvement over the baseline.", acknowledgement = ack-nhfb, affiliation = "Zheng, Z (Reprint Author), Natl Univ Def Technol, State Key Lab High Performance Comp, Changsha, Hunan, Peoples R China. Zheng, Zhong; Wang, Zhiying, Natl Univ Def Technol, State Key Lab High Performance Comp, Changsha, Hunan, Peoples R China. Zheng, Zhong; Wang, Zhiying, Natl Univ Def Technol, Sch Comp, Changsha, Hunan, Peoples R China. Lipasti, Mikko, Univ Wisconsin, Dept Elect \& Comp Engn, Madison, WI 54706 USA.", author-email = "zheng\_zhong@nudt.edu.cn zywang@nudt.edu.cn mikko@engr.wisc.edu", da = "2019-06-20", doc-delivery-number = "CZ7DC", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "CSC; 863 Program [2012AA010905]; NSFC [61070037, 61272143, 61272144, 61103016, 61202121]; NUDT [B120607]; RFDP [20114307120013]; NSF [CCF-1318298]", funding-text = "This work was partially supported by CSC, 863 Program (2012AA010905), NSFC (61070037, 61272143, 61272144, 61103016, 61202121), NUDT(B120607), RFDP (20114307120013), and NSF (CCF-1318298).", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "access patterns; adaptive cache-and-concurrency allocation; Bandwidth; bandwidth utilization improvement; benchmark performance improvement; Benchmark testing; bypassed warps; cache; cache lines; cache locality; Cache memory; cache storage; cache thrashing prevention; cached warps; CCA; computational resource utilization improvement; concurrency; concurrency control; Concurrent computing; GPGPU; GPGPU performance improvement; graphics processing units; harmonic mean IPC improvement; Instruction sets; memory bandwidth saturation; multi-threading; multiprocessing systems; performance evaluation; Resource management; reuse distance; stream multiprocessor; waiting warp descheduling", number-of-cited-references = "11", research-areas = "Computer Science", times-cited = "4", unique-id = "Zheng:2015:ACC", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Altiparmak:2016:MMF, author = "N. Altiparmak and A. S. Tosun", title = "Multithreaded Maximum Flow Based Optimal Replica Selection Algorithm for Heterogeneous Storage Architectures", journal = j-IEEE-TRANS-COMPUT, volume = "65", number = "5", pages = "1543--1557", month = may, year = "2016", CODEN = "ITCOB4", DOI = "https://doi.org/10.1109/TC.2015.2451620", ISSN = "0018-9340 (print), 1557-9956 (electronic)", ISSN-L = "0018-9340", bibdate = "Fri Apr 15 13:39:43 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Computers", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12", } @Article{Arjomand:2016:BAP, author = "Mohammad Arjomand and Mahmut T. Kandemir and Anand Sivasubramaniam and Chita R. 
Das", title = "Boosting access parallelism to {PCM}-based main memory", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "695--706", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001211", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Despite its promise as a DRAM main memory replacement, Phase Change Memory (PCM) has high write latencies which can be a serious detriment to its widespread adoption. Apart from slowing down a write request, the consequent high latency can also keep other chips of the same rank, that are not involved in this write, idle for long times. There are several practical considerations that make it difficult to allow subsequent reads and/or writes to be served concurrently from the same chips during the long latency write. This paper proposes and evaluates several novel mechanisms --- re-constructing data from error correction bits instead of waiting for chips currently busy to serve a read, rotating word mappings across chips of a PCM rank, and rotating the mapping of error detection/correction bits across these chips --- to overlap several reads with an ongoing write (RoW) and even a write with an ongoing write (WoW). The paper also presents the necessary micro-architectural enhancements needed to implement these mechanisms, without significantly changing the current interfaces. The resulting PCM access parallelism (PCMap) system incorporating these enhancements, boosts the intra-rank-level parallelism during such writes from a very low baseline value of 2.4 to an average and maximum values of 4.5 and 7.4, respectively (out of a maximum of 8.0), across a wide spectrum of both multiprogrammed and multithreaded workloads. 
This boost in parallelism results in an average IPC improvement of 15.6\% and 16.7\% for the multiprogrammed and multithreaded workloads, respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", remark = "ISCA '16 conference proceedings.", } @Article{Badamo:2016:IPE, author = "Michael Badamo and Jeff Casarona and Minshu Zhao and Donald Yeung", title = "Identifying Power-Efficient Multicore Cache Hierarchies via Reuse Distance Analysis", journal = j-TOCS, volume = "34", number = "1", pages = "3:1--3:??", month = apr, year = "2016", CODEN = "ACSYEC", DOI = "https://doi.org/10.1145/2851503", ISSN = "0734-2071 (print), 1557-7333 (electronic)", ISSN-L = "0734-2071", bibdate = "Sat May 21 08:09:53 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tocs/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/tocs.bib", abstract = "To enable performance improvements in a power-efficient manner, computer architects have been building CPUs that exploit greater amounts of thread-level parallelism. A key consideration in such CPUs is properly designing the on-chip cache hierarchy. Unfortunately, this can be hard to do, especially for CPUs with high core counts and large amounts of cache. The enormous design space formed by the combinatorial number of ways in which to organize the cache hierarchy makes it difficult to identify power-efficient configurations. Moreover, the problem is exacerbated by the slow speed of architectural simulation, which is the primary means for conducting such design space studies. A powerful tool that can help architects optimize CPU cache hierarchies is reuse distance (RD) analysis. Recent work has extended uniprocessor RD techniques-i.e., by introducing concurrent RD and private-stack RD profiling-to enable analysis of different types of caches in multicore CPUs. 
Once acquired, parallel locality profiles can predict the performance of numerous cache configurations, permitting highly efficient design space exploration. To date, existing work on multicore RD analysis has focused on developing the profiling techniques and assessing their accuracy. Unfortunately, there has been no work on using RD analysis to optimize CPU performance or power consumption. This article investigates applying multicore RD analysis to identify the most power efficient cache configurations for a multicore CPU. First, we develop analytical models that use the cache-miss counts from parallel locality profiles to estimate CPU performance and power consumption. Although future scalable CPUs will likely employ multithreaded (and even out-of-order) cores, our current study assumes single-threaded in-order cores to simplify the models, allowing us to focus on the cache hierarchy and our RD-based techniques. Second, to demonstrate the utility of our techniques, we apply our models to optimize a large-scale tiled CPU architecture with a two-level cache hierarchy. We show that the most power efficient configuration varies considerably across different benchmarks, and that our locality profiles provide deep insights into why certain configurations are power efficient. We also show that picking the best configuration can provide significant gains, as there is a 2.01x power efficiency spread across our tiled CPU design space. Finally, we validate the accuracy of our techniques using detailed simulation. Among several simulated configurations, our techniques can usually pick the most power efficient configuration, or one that is very close to the best. 
In addition, across all simulated configurations, we can predict power efficiency with 15.2\% error.", acknowledgement = ack-nhfb, articleno = "3", fjournal = "ACM Transactions on Computer Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J774", } @Article{Balkind:2016:OOS, author = "Jonathan Balkind and Michael McKeown and Yaosheng Fu and Tri Nguyen and Yanqi Zhou and Alexey Lavrov and Mohammad Shahrad and Adi Fuchs and Samuel Payne and Xiaohua Liang and Matthew Matl and David Wentzlaff", title = "{OpenPiton}: an Open Source Manycore Research Framework", journal = j-OPER-SYS-REV, volume = "50", number = "2", pages = "217--232", month = jun, year = "2016", CODEN = "OSRED8", DOI = "https://doi.org/10.1145/2954680.2872414", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Thu Jun 9 17:03:34 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/gnu.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/opersysrev.bib", abstract = "Industry is building larger, more complex, manycore processors on the back of strong institutional knowledge, but academic projects face difficulties in replicating that scale. To alleviate these difficulties and to develop and share knowledge, the community needs open architecture frameworks for simulation, synthesis, and software exploration which support extensibility, scalability, and configurability, alongside an established base of verification tools and supported software. In this paper we present OpenPiton, an open source framework for building scalable architecture research prototypes from 1 core to 500 million cores. OpenPiton is the world's first open source, general-purpose, multithreaded manycore processor and framework. OpenPiton leverages the industry hardened OpenSPARC T1 core with modifications and builds upon it with a scratch-built, scalable uncore creating a flexible, modern manycore design. 
In addition, OpenPiton provides synthesis and backend scripts for ASIC and FPGA to enable other researchers to bring their designs to implementation. OpenPiton provides a complete verification infrastructure of over 8000 tests, is supported by mature software tools, runs full-stack multiuser Debian Linux, and is written in industry standard Verilog. Multiple implementations of OpenPiton have been created including a taped-out 25-core implementation in IBM's 32nm process and multiple Xilinx FPGA prototypes.", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J597", } @Article{Cao:2016:DBG, author = "Man Cao and Minjia Zhang and Aritra Sengupta and Michael D. Bond", title = "Drinking from both glasses: combining pessimistic and optimistic tracking of cross-thread dependences", journal = j-SIGPLAN, volume = "51", number = "8", pages = "20:1--20:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851143", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "It is notoriously challenging to develop parallel software systems that are both scalable and correct. Runtime support for parallelism---such as multithreaded record {\&} replay, data race detectors, transactional memory, and enforcement of stronger memory models---helps achieve these goals, but existing commodity solutions slow programs substantially in order to track (i.e., detect or control) an execution's cross-thread dependences accurately. 
Prior work tracks cross-thread dependences either ``pessimistically,'' slowing every program access, or ``optimistically,'' allowing for lightweight instrumentation of most accesses but dramatically slowing accesses involved in cross-thread dependences. This paper seeks to hybridize pessimistic and optimistic tracking, which is challenging because there exists a fundamental mismatch between pessimistic and optimistic tracking. We address this challenge based on insights about how dependence tracking and program synchronization interact, and introduce a novel approach called hybrid tracking. Hybrid tracking is suitable for building efficient runtime support, which we demonstrate by building hybrid-tracking-based versions of a dependence recorder and a region serializability enforcer. An adaptive, profile-based policy makes runtime decisions about switching between pessimistic and optimistic tracking. Our evaluation shows that hybrid tracking enables runtime support to overcome the performance limitations of both pessimistic and optimistic tracking alone.", acknowledgement = ack-nhfb, articleno = "20", fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "PPoPP '16 conference proceedings.", } @Article{Chen:2016:TMR, author = "Kuan-Hsun Chen and Jian-Jia Chen and Florian Kriebel and Semeen Rehman and Muhammad Shafique and J{\"o}rg Henkel", title = "Task Mapping for Redundant Multithreading in Multi-Cores with Reliability and Performance Heterogeneity", journal = j-IEEE-TRANS-COMPUT, volume = "65", number = "11", pages = "3441--3455", month = nov, year = "2016", CODEN = "ITCOB4", DOI = "https://doi.org/10.1109/TC.2016.2532862", ISSN = "0018-9340 (print), 1557-9956 (electronic)", ISSN-L = "0018-9340", bibdate = "Tue Oct 11 05:14:24 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE 
Transactions on Computers", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12", } @Article{Creech:2016:TSS, author = "Timothy Creech and Rajeev Barua", title = "Transparently Space Sharing a Multicore Among Multiple Processes", journal = j-TOPC, volume = "3", number = "3", pages = "17:1--17:??", month = dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/3001910", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Mon Dec 26 17:40:41 MST 2016", bibsource = "http://topc.acm.org/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "As hardware becomes increasingly parallel and the availability of scalable parallel software improves, the problem of managing multiple multithreaded applications (processes) becomes important. Malleable processes, which can vary the number of threads used as they run, enable sophisticated and flexible resource management. Although many existing applications parallelized for SMPs with parallel runtimes are in fact already malleable, deployed runtime environments provide no interface nor any strategy for intelligently allocating hardware threads or even preventing oversubscription. Prior research methods either depend on profiling applications ahead of time to make good decisions about allocations or do not account for process efficiency at all, leading to poor performance. None of these prior methods have been adapted widely in practice. This article presents the Scheduling and Allocation with Feedback (SCAF) system: a drop-in runtime solution that supports existing malleable applications in making intelligent allocation decisions based on observed efficiency without any changes to semantics, program modification, offline profiling, or even recompilation. Our existing implementation can control most unmodified OpenMP applications. 
Other malleable threading libraries can also easily be supported with small modifications without requiring application modification or recompilation. In this work, we present the SCAF daemon and a SCAF-aware port of the GNU OpenMP runtime. We present a new technique for estimating process efficiency purely at runtime using available hardware counters and demonstrate its effectiveness in aiding allocation decisions. We evaluated SCAF using NAS NPB parallel benchmarks on five commodity parallel platforms, enumerating architectural features and their effects on our scheme. We measured the benefit of SCAF in terms of sum of speedups improvement (a common metric for multiprogrammed environments) when running all benchmark pairs concurrently compared to equipartitioning --- the best existing competing scheme in the literature. We found that SCAF improves on equipartitioning on four out of five machines, showing a mean improvement factor in sum of speedups of 1.04 to 1.11x for benchmark pairs, depending on the machine, and 1.09x on average. Since we are not aware of any widely available tool for equipartitioning, we also compare SCAF against multiprogramming using unmodified OpenMP, which is the only environment available to end users today. 
SCAF improves on the unmodified OpenMP runtimes for all five machines, with a mean improvement of 1.08 to 2.07x, depending on the machine, and 1.59x on average.", acknowledgement = ack-nhfb, articleno = "17", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Daloze:2016:ETS, author = "Benoit Daloze and Stefan Marr and Daniele Bonetta and Hanspeter M{\"o}ssenb{\"o}ck", title = "Efficient and thread-safe objects for dynamically-typed languages", journal = j-SIGPLAN, volume = "51", number = "10", pages = "642--659", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984001", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We are in the multi-core era. Dynamically-typed languages are in widespread use, but their support for multithreading still lags behind. One of the reasons is that the sophisticated techniques they use to efficiently represent their dynamic object models are often unsafe in multithreaded environments. This paper defines safety requirements for dynamic object models in multithreaded environments. Based on these requirements, a language-agnostic and thread-safe object model is designed that maintains the efficiency of sequential approaches. This is achieved by ensuring that field reads do not require synchronization and field updates only need to synchronize on objects shared between threads. Basing our work on JRuby+Truffle, we show that our safe object model has zero overhead on peak performance for thread-local objects and only 3\% average overhead on parallel benchmarks where field updates require synchronization. 
Thus, it can be a foundation for safe and efficient multithreaded VMs for a wide range of dynamic languages.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "OOPSLA '16 conference proceedings.", } @Article{Deniz:2016:UML, author = "Etem Deniz and Alper Sen", title = "Using Machine Learning Techniques to Detect Parallel Patterns of Multi-threaded Applications", journal = j-INT-J-PARALLEL-PROG, volume = "44", number = "4", pages = "867--900", month = aug, year = "2016", CODEN = "IJPPE5", DOI = "https://doi.org/10.1007/s10766-015-0396-z", ISSN = "0885-7458 (print), 1573-7640 (electronic)", ISSN-L = "0885-7458", bibdate = "Tue Sep 20 10:50:00 MDT 2016", bibsource = "http://link.springer.com/journal/10766/44/4; https://www.math.utah.edu/pub/tex/bib/intjparallelprogram.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer.com/article/10.1007/s10766-015-0396-z", acknowledgement = ack-nhfb, fjournal = "International Journal of Parallel Programming", journal-URL = "http://link.springer.com/journal/10766", } @Article{Denniston:2016:DH, author = "Tyler Denniston and Shoaib Kamil and Saman Amarasinghe", title = "Distributed {Halide}", journal = j-SIGPLAN, volume = "51", number = "8", pages = "5:1--5:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851157", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Many image processing tasks are naturally expressed as a pipeline of small computational kernels known as stencils. Halide is a popular domain-specific language and compiler designed to implement image processing algorithms. 
Halide uses simple language constructs to express what to compute and a separate scheduling co-language for expressing when and where to perform the computation. This approach has demonstrated performance comparable to or better than hand-optimized code. Until now, however, Halide has been restricted to parallel shared memory execution, limiting its performance for memory-bandwidth-bound pipelines or large-scale image processing tasks. We present an extension to Halide to support distributed-memory parallel execution of complex stencil pipelines. These extensions compose with the existing scheduling constructs in Halide, allowing expression of complex computation and communication strategies. Existing Halide applications can be distributed with minimal changes, allowing programmers to explore the tradeoff between recomputation and communication with little effort. Approximately 10 new lines of code are needed even for a 200 line, 99 stage application. On nine image processing benchmarks, our extensions give up to a 1.4$ \times $ speedup on a single node over regular multithreaded execution with the same number of cores, by mitigating the effects of non-uniform memory access. 
The distributed benchmarks achieve up to 18$ \times $ speedup on a 16 node testing machine and up to 57$ \times $ speedup on 64 nodes of the NERSC Cori supercomputer.", acknowledgement = ack-nhfb, articleno = "5", fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "PPoPP '16 conference proceedings.", } @Article{Diavastos:2016:ITD, author = "Andreas Diavastos and Pedro Trancoso and Mikel Luj{\'a}n and Ian Watson", title = "Integrating Transactions into the Data-Driven Multi-threading Model Using the {TFlux} Platform", journal = j-INT-J-PARALLEL-PROG, volume = "44", number = "2", pages = "257--277", month = apr, year = "2016", CODEN = "IJPPE5", DOI = "https://doi.org/10.1007/s10766-015-0369-2", ISSN = "0885-7458 (print), 1573-7640 (electronic)", ISSN-L = "0885-7458", bibdate = "Thu Apr 7 12:08:24 MDT 2016", bibsource = "http://link.springer.com/journal/10766/44/2; https://www.math.utah.edu/pub/tex/bib/intjparallelprogram.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer.com/article/10.1007/s10766-015-0369-2", acknowledgement = ack-nhfb, fjournal = "International Journal of Parallel Programming", journal-URL = "http://link.springer.com/journal/10766", } @Article{Dublish:2016:CCG, author = "Saumay Dublish and Vijay Nagarajan and Nigel Topham", title = "Cooperative Caching for {GPUs}", journal = j-TACO, volume = "13", number = "4", pages = "39:1--39:??", month = dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/3001589", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 28 16:24:46 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The rise of general-purpose computing on GPUs has influenced architectural innovation on them. The introduction of an on-chip cache hierarchy is one such innovation. 
High L1 miss rates on GPUs, however, indicate inefficient cache usage due to myriad factors, such as cache thrashing and extensive multithreading. Such high L1 miss rates in turn place high demands on the shared L2 bandwidth. Extensive congestion in the L2 access path therefore results in high memory access latencies. In memory-intensive applications, these latencies get exposed due to a lack of active compute threads to mask such high latencies. In this article, we aim to reduce the pressure on the shared L2 bandwidth, thereby reducing the memory access latencies that lie in the critical path. We identify significant replication of data among private L1 caches, presenting an opportunity to reuse data among L1s. We further show how this reuse can be exploited via an L1 Cooperative Caching Network (CCN), thereby reducing the bandwidth demand on L2. In the proposed architecture, we connect the L1 caches with a lightweight ring network to facilitate intercore communication of shared data. We show that this technique reduces traffic to the L2 cache by an average of 29\%, freeing up the bandwidth for other accesses. We also show that the CCN reduces the average memory latency by 24\%, thereby reducing core stall cycles by 26\% on average. This translates into an overall performance improvement of 14.7\% on average (and up to 49\%) for applications that exhibit reuse across L1 caches. In doing so, the CCN incurs a nominal area and energy overhead of 1.3\% and 2.5\%, respectively. 
Notably, the performance improvement with our proposed CCN compares favorably to the performance improvement achieved by simply doubling the number of L2 banks by up to 34\%.", acknowledgement = ack-nhfb, articleno = "39", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", } @Article{Eiras-Franco:2016:MSP, author = "Carlos Eiras-Franco and Ver{\'o}nica Bol{\'o}n-Canedo and Sabela Ramos and Jorge Gonz{\'a}lez-Dom{\'\i}nguez and Amparo Alonso-Betanzos and Juan Touri{\~n}o", title = "Multithreaded and {Spark} parallelization of feature selection filters", journal = j-J-COMPUT-SCI, volume = "17 (part 3)", pages = "609--619", month = nov, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1016/j.jocs.2016.07.002", ISSN = "1877-7503 (print), 1877-7511 (electronic)", ISSN-L = "1877-7503", bibdate = "Tue Sep 19 13:54:18 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jcomputsci.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "https://www.sciencedirect.com/science/article/pii/S1877750316301107", acknowledgement = ack-nhfb, ajournal = "J. Comput. 
Sci.", fjournal = "Journal of Computational Science", journal-URL = "https://www.sciencedirect.com/journal/journal-of-computational-science", } @Article{Evtyushkin:2016:UMC, author = "Dmitry Evtyushkin and Dmitry Ponomarev and Nael Abu-Ghazaleh", title = "Understanding and Mitigating Covert Channels Through Branch Predictors", journal = j-TACO, volume = "13", number = "1", pages = "10:1--10:??", month = apr, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2870636", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Apr 5 16:27:36 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Covert channels through shared processor resources provide secret communication between two malicious processes: the trojan and the spy. In this article, we classify, analyze, and compare covert channels through dynamic branch prediction units in modern processors. Through experiments on a real hardware platform, we compare contention-based channel and the channel that is based on exploiting the branch predictor's residual state. We analyze these channels in SMT and single-threaded environments under both clean and noisy conditions. Our results show that the residual state-based channel provides a cleaner signal and is effective even in noisy execution environments with another application sharing the same physical core with the trojan and the spy. We also estimate the capacity of the branch predictor covert channels and describe a software-only mitigation technique that is based on randomizing the state of the predictor tables on context switches. 
We show that this protection eliminates all covert channels through the branch prediction unit with minimal impact on performance.", acknowledgement = ack-nhfb, articleno = "10", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", } @Article{Feliu:2016:BAL, author = "J. Feliu and J. Sahuquillo and S. Petit and J. Duato", title = "Bandwidth-Aware On-Line Scheduling in {SMT} Multicores", journal = j-IEEE-TRANS-COMPUT, volume = "65", number = "2", pages = "422--434", month = "????", year = "2016", CODEN = "ITCOB4", DOI = "https://doi.org/10.1109/TC.2015.2428694", ISSN = "0018-9340 (print), 1557-9956 (electronic)", ISSN-L = "0018-9340", bibdate = "Tue Jan 19 07:06:51 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Computers", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12", } @Article{Harish:2016:PIK, author = "Pawan Harish and Mentar Mahmudi and Beno{\^\i}t {Le Callennec} and Ronan Boulic", title = "Parallel Inverse Kinematics for Multithreaded Architectures", journal = j-TOG, volume = "35", number = "2", pages = "19:1--19:??", month = may, year = "2016", CODEN = "ATGRDF", DOI = "https://doi.org/10.1145/2887740", ISSN = "0730-0301 (print), 1557-7368 (electronic)", ISSN-L = "0730-0301", bibdate = "Mon Jun 20 09:13:19 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tog/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/tog.bib", abstract = "In this article, we present a parallel prioritized Jacobian-based inverse kinematics algorithm for multithreaded architectures. We solve damped least squares inverse kinematics using a parallel line search by identifying and sampling critical input parameters. 
Parallel competing execution paths are spawned for each parameter in order to select the optimum that minimizes the error criteria. Our algorithm is highly scalable and can handle complex articulated bodies at interactive frame rates. We show results on complex skeletons consisting of more than 600 degrees of freedom while being controlled using multiple end effectors. We implement the algorithm both on multicore and GPU architectures and demonstrate how the GPU can further exploit fine-grain parallelism not directly available on a multicore processor. Our implementations are 10 to 150 times faster compared to a state-of-the-art serial implementation while providing higher accuracy. We also demonstrate the scalability of the algorithm over multiple scenarios and explore the GPU implementation in detail.", acknowledgement = ack-nhfb, articleno = "19", fjournal = "ACM Transactions on Graphics", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J778", } @Article{Hashemi:2016:EEB, author = "Milad Hashemi and Debbie Marr and Doug Carmean and Yale N. Patt", title = "Efficient Execution of Bursty Applications", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "2", pages = "85--88", month = jul # "\slash " # dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2456013", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The performance of user-facing applications is critical to client platforms. Many of these applications are event-driven and exhibit ``bursty'' behavior: the application is generally idle but generates bursts of activity in response to human interaction. 
We study one example of a bursty application, web-browsers, and produce two important insights: (1) Activity bursts contain false parallelism, bringing many cores out of a deep sleep to inefficiently render a single webpage, and (2) these bursts are highly compute driven, and thus scale nearly linearly with frequency. We show average performance gains/energy reductions of 14\%/17\% respectively on real hardware by statically moving threads from multiple cores to a single core. We then propose dynamic hardware driven thread migration and scheduling enhancements that detect these bursts, leading to further benefits.", acknowledgement = ack-nhfb, affiliation = "Hashemi, M (Reprint Author), Univ Texas Austin, Elect \& Comp Engn, Austin, TX 78701 USA. Hashemi, Milad; Patt, Yale N., Univ Texas Austin, Elect \& Comp Engn, Austin, TX 78701 USA. Marr, Debbie, Intel Corp, Intel Labs, Portland, OR USA. Carmean, Doug, Microsoft, Microsoft Res, Seattle, WA USA.", author-email = "miladh@hps.utexas.edu debbie.marr@intel.com dcarmean@microsoft.com patt@hps.utexas.edu", da = "2019-06-20", doc-delivery-number = "EH9MM", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Intel Corporation; Cockrell Foundation; HPS Research Group", funding-text = "The authors thank Intel Corporation and the Cockrell Foundation for their continued generous financial support of the HPS Research Group.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Browsers; bursty applications; dynamic hardware; Energy; energy reductions; Hardware; human computer interaction; human interaction; Instruction sets; Internet; Loading; multi-threading; Multicore processing; multiple cores; multiprocessing systems; online front-ends; Operating systems; performance; performance evaluation; performance gains; power aware computing; thread migration; thread scheduling; Web-browsers; Webpage; webpages; webpages, thread scheduling", number-of-cited-references = "13", research-areas = "Computer Science", times-cited = "0", unique-id = "Hashemi:2016:EEB", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Hu:2016:TDM, author = "Qi Hu and Peng Liu and Michael C. Huang", title = "Threads and Data Mapping: Affinity Analysis for Traffic Reduction", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "2", pages = "133--136", month = jul # "\slash " # dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2451172", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Jun 20 17:18:18 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Modern processors spend significant amount of time and energy moving data. With the increase in core count, the relative importance of such latency and energy expenditure will only increase with time. Inter-core communication traffic when executing a multithreaded application is one such source of latency and energy expenditure. This traffic is influenced by the mapping of threads and data onto multicore systems. This paper investigates the impact of threads and data mapping on traffic in a chip-multiprocessor, and exploits the potential for traffic reduction through threads and data mapping. 
Based on the analysis and estimation of the lowest traffic, we propose a threads and data mapping mechanism to approach the lowest traffic. The mapping takes both the correlation among threads and the affinity of data with individual threads into account, and results in significant traffic reduction and energy savings.", acknowledgement = ack-nhfb, affiliation = "Liu, P (Reprint Author), Zhejiang Univ, Coll Informat Sci \& Elect Engn, Hangzhou 310027, Peoples R China. Hu, Qi; Liu, Peng, Zhejiang Univ, Coll Informat Sci \& Elect Engn, Hangzhou 310027, Peoples R China. Huang, Michael C., Univ Rochester, Dept Elect \& Comp Engn, 601 Elmwood Ave, Rochester, NY 14627 USA.", author-email = "huqi\_isee@zju.edu.cn liupeng@zju.edu.cn michael.huang@rochester.edu", da = "2019-06-20", doc-delivery-number = "EH9MM", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "NSFC [61028004]; US National Science Foundation (NSF) [1217662, 1255729]; Open Project Program of the State Key Laboratory of Mathematical Engineering and Advanced Computing [2014A08, 2015A09]", funding-text = "This work was supported by NSFC under grant 61028004, and also in part by US National Science Foundation (NSF) under grants 1217662 and 1255729, and the Open Project Program of the State Key Laboratory of Mathematical Engineering and Advanced Computing under grants 2014A08 and 2015A09. P. Liu is the corresponding author.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Mapping; memory; multicore; network-on-chip", keywords-plus = "NETWORKS; CACHES; CHIP", number-of-cited-references = "11", oa = "Bronze", research-areas = "Computer Science", times-cited = "0", unique-id = "Hu:2016:TDM", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Huang:2016:MCR, author = "Shiyou Huang and Jeff Huang", title = "Maximal causality reduction for {TSO} and {PSO}", journal = j-SIGPLAN, volume = "51", number = "10", pages = "447--461", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984025", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Verifying concurrent programs is challenging due to the exponentially large thread interleaving space. The problem is exacerbated by relaxed memory models such as Total Store Order (TSO) and Partial Store Order (PSO) which further explode the interleaving space by reordering instructions. A recent advance, Maximal Causality Reduction (MCR), has shown great promise to improve verification effectiveness by maximally reducing redundant explorations. However, the original MCR only works for the Sequential Consistency (SC) memory model, but not for TSO and PSO. In this paper, we develop novel extensions to MCR by solving two key problems under TSO and PSO: (1) generating interleavings that can reach new states by encoding the operational semantics of TSO and PSO with first-order logical constraints and solving them with SMT solvers, and (2) enforcing TSO and PSO interleavings by developing novel replay algorithms that allow executions out of the program order. 
We show that our approach successfully enables MCR to effectively explore TSO and PSO interleavings. We have compared our approach with a recent Dynamic Partial Order Reduction (DPOR) algorithm for TSO and PSO and a SAT-based stateless model checking approach. Our results show that our approach is much more effective than the other approaches for both state-space exploration and bug finding --- on average it explores 5-10X fewer executions and finds many bugs that the other tools cannot find.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "OOPSLA '16 conference proceedings.", } @Article{Huang:2016:PMR, author = "Jeff Huang and Arun K. Rajagopalan", title = "Precise and maximal race detection from incomplete traces", journal = j-SIGPLAN, volume = "51", number = "10", pages = "462--476", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984024", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present RDIT, a novel dynamic technique to detect data races in multithreaded programs with incomplete trace information, i.e., in the presence of missing events. RDIT is both precise and maximal: it does not report any false alarms and it detects a maximal set of true races from the observed incomplete trace. RDIT is underpinned by a sound BarrierPair model that abstracts away the missing events by capturing the invocation data of their enclosing methods. 
By making the least conservative abstraction that a missing method introduces synchronization only when it has a memory address in scope that overlaps with other events or other missing methods, and by formulating maximal thread causality as logical constraints, RDIT guarantees to precisely detect races with maximal capability. RDIT has been applied in seven real-world large concurrent systems and has detected dozens of true races with zero false alarms. Comparatively, existing algorithms such as Happens-Before, Causal-Precedes, and Maximal-Causality which are known to be precise all report many false alarms when missing synchronizations.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "OOPSLA '16 conference proceedings.", } @Article{Jiang:2016:TLH, author = "Chuntao Jiang and Zhibin Yu and Lieven Eeckhout and Hai Jin and Xiaofei Liao and Chengzhong Xu", title = "Two-Level Hybrid Sampled Simulation of Multithreaded Applications", journal = j-TACO, volume = "12", number = "4", pages = "39:1--39:??", month = jan, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2818353", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Feb 16 15:36:38 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Sampled microarchitectural simulation of single-threaded applications is mature technology for over a decade now. Sampling multithreaded applications, on the other hand, is much more complicated. Not until very recently have researchers proposed solutions for sampled simulation of multithreaded applications. Time-Based Sampling (TBS) samples multithreaded application execution based on time---not instructions as is typically done for single-threaded applications---yielding estimates for a multithreaded application's execution time. 
In this article, we revisit and analyze previously proposed TBS approaches (periodic and cantor fractal based sampling), and we obtain a number of novel and surprising insights, such as (i) accurately estimating fast-forwarding IPC, that is, performance in-between sampling units, is more important than accurately estimating sample IPC, that is, performance within the sampling units; (ii) fast-forwarding IPC estimation accuracy is determined by both the sampling unit distribution and how to use the sampling units to predict fast-forwarding IPC; and (iii) cantor sampling is more accurate at small sampling unit sizes, whereas periodic is more accurate at large sampling unit sizes. These insights lead to the development of Two-level Hybrid Sampling (THS), a novel sampling methodology for multithreaded applications that combines periodic sampling's accuracy at large time scales (i.e., uniformly selecting coarse-grain sampling units across the entire program execution) with cantor sampling's accuracy at small time scales (i.e., the ability to accurately predict fast-forwarding IPC in-between small sampling units). The clustered occurrence of small sampling units under cantor sampling also enables shortened warmup and thus enhanced simulation speed. Overall, THS achieves an average absolute execution time prediction error of 4\% while yielding an average simulation speedup of 40 $ \times $ compared to detailed simulation, which is both more accurate and faster than the current state-of-the-art. 
Case studies illustrate THS' ability to accurately predict relative performance differences across the design space.", acknowledgement = ack-nhfb, articleno = "39", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", } @Article{Jung:2016:LPS, author = "Daejin Jung and Sheng Li and Jung Ho Ahn", title = "Large Pages on Steroids: Small Ideas to Accelerate Big Memory Applications", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "2", pages = "101--104", month = jul # "\slash " # dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2495103", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Utilizing small (e.g., 4 KB) pages incurs frequent TLB misses on modern big memory applications, substantially degrading the performance of the system. Large (e.g., 1 GB) pages or direct segments can alleviate this penalty due to page table walks, but at the same time such a strategy exposes the organizational and operational details of modern DRAM-based memory systems to applications. Row-buffer conflicts caused by accesses heading to the same DRAM bank but different rows from multiple threads are regarded as the main culprits behind the very large gaps between peak and achieved main memory throughput, but hardware-based approaches in memory controllers have achieved only limited success whereas existing proposals that change memory allocators cannot be applied to large pages or direct segments. In this paper, we propose a set of application-level techniques to improve the effective main memory bandwidth. 
The techniques stem from the two key observations that (1) each thread of an application exclusively accesses certain datasets for a short or long period of time, and (2) superfluous memory reads originating from a cache's write allocation policy can be avoided if scatters during the data shuffling pass through intermediate cache-friendly buffers. Experiments with a contemporary x86 server show that combining large pages with the proposed address linearization, bank coloring, and write streaming techniques improves the performance of the three big memory applications of high-throughput key-value store, fast-Fourier transform, and radix sort by 37.6, 22.9, and 68.1 percent, respectively.", acknowledgement = ack-nhfb, affiliation = "Jung, D (Reprint Author), Seoul Natl Univ, Dept Transdisciplinary Studies, Seoul, South Korea. Jung, Daejin; Ahn, Jung Ho, Seoul Natl Univ, Dept Transdisciplinary Studies, Seoul, South Korea. Li, Sheng, Intel Labs, Santa Clara, CA USA. Ahn, Jung Ho, Seoul Natl Univ, Big Data Inst, Seoul, South Korea.", author-email = "haidj@snu.ac.kr sheng.r.li@intel.com gajh@snu.ac.kr", da = "2019-06-20", doc-delivery-number = "EH9MM", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "National Research Foundation of Korea - Korea government [NRF-2014R1A2A1A11052936, NRF-2012M3A9D1054622]", funding-text = "The authors thank Jongwook Chung and Jaeyoon Choi on their contributions to application writing and experiments. This work was partially supported by the National Research Foundation of Korea grant funded by the Korea government (NRF-2014R1A2A1A11052936 and NRF-2012M3A9D1054622). Jung Ho Ahn is also with Big Data Institute, Seoul National University.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "address linearization; application-level techniques; Bandwidth; bank coloring; big memory applications; cache storage; cache write allocation policy; cache-friendly buffers; data shuffling; DRAM bank; DRAM chips; DRAM-based memory; fast-Fourier transform; high-throughput key-value store; Instruction sets; large pages; memory allocators; memory bandwidth; memory controllers; Memory management; memory throughput; multi-threading; multiple threads; Performance gain; Physical-to-DRAM address mapping; radix sort; Random access memory; row-buffer conflicts; Servers; superfluous memory reads; write streaming", number-of-cited-references = "14", ORCID-numbers = "Ahn, Jung Ho/0000-0003-1733-1394", research-areas = "Computer Science", times-cited = "0", unique-id = "Jung:2016:LPS", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Kalayappan:2016:FRT, author = "Rajshekar Kalayappan and Smruti R. Sarangi", title = "{FluidCheck}: a Redundant Threading-Based Approach for Reliable Execution in Manycore Processors", journal = j-TACO, volume = "12", number = "4", pages = "55:1--55:??", month = jan, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2842620", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Feb 16 15:36:38 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Soft errors have become a serious cause of concern with reducing feature sizes. The ability to accommodate complex, Simultaneous Multithreading (SMT) cores on a single chip presents a unique opportunity to achieve reliable execution, safe from soft errors, with low performance penalties. In this context, we present FluidCheck, a checker architecture that allows highly flexible assignment and migration of checking duties across cores. 
In this article, we present a mechanism to dynamically use the resources of SMT cores for checking the results of other threads, and propose a variety of heuristics for migration of such checker threads across cores. Secondly, to make the process of checking more efficient, we propose a set of architectural enhancements that reduce power consumption, decrease the length of the critical path, and reduce the load on the Network-on-Chip (NoC). Based on our observations, we design a 16 core system for running SPEC2006 based bag-of-tasks applications. Our experiments demonstrate that fully reliable execution can be attained with a mere 27\% slowdown, surpassing traditional redundant threading based techniques by roughly 42\%.", acknowledgement = ack-nhfb, articleno = "55", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", } @Article{Kim:2016:SEA, author = "Youngho Kim and Joong Chae Na and Heejin Park and Jeong Seop Sim", title = "A space-efficient alphabet-independent {Four-Russians}' lookup table and a multithreaded {Four-Russians}' edit distance algorithm", journal = j-THEOR-COMP-SCI, volume = "656 (Part B)", number = "??", pages = "173--179", day = "20", month = dec, year = "2016", CODEN = "TCSCDI", ISSN = "0304-3975 (print), 1879-2294 (electronic)", ISSN-L = "0304-3975", bibdate = "Fri Dec 9 12:17:02 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/tcs2015.bib", URL = "http://www.sciencedirect.com/science/article/pii/S0304397516300676", acknowledgement = ack-nhfb, fjournal = "Theoretical Computer Science", journal-URL = "http://www.sciencedirect.com/science/journal/03043975/", } @Article{Kong:2016:GSB, author = "Weiqiang Kong and Gang Hou and Xiangpei Hu and Takahiro Ando and Kenji Hisazumi and Akira Fukuda", title = "{Garakabu2}: an {SMT}-based bounded model checker for {HSTM} designs in {ZIPC}", journal = 
j-J-INFO-SEC-APPL, volume = "31", number = "??", pages = "61--74", month = dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1016/j.jisa.2016.08.001", ISSN = "2214-2126", ISSN-L = "2214-2126", bibdate = "Sun May 8 11:59:12 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/jinfosecappl.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.sciencedirect.com/science/article/pii/S2214212616301600", acknowledgement = ack-nhfb, ajournal = "J. Info. Sec. Appl.", fjournal = "Journal of Information Security and Applications (JISA)", journal-URL = "http://www.sciencedirect.com/science/journal/22142126", } @Article{Kutsuna:2016:ARM, author = "Takuro Kutsuna and Yoshinao Ishii", title = "Abstraction and refinement of mathematical functions toward {SMT}-based test-case generation", journal = j-INT-J-SOFTW-TOOLS-TECHNOL-TRANSFER, volume = "18", number = "1", pages = "109--120", month = feb, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1007/s10009-015-0389-7", ISSN = "1433-2779 (print), 1433-2787 (electronic)", ISSN-L = "1433-2779", bibdate = "Mon Jan 25 08:12:53 MST 2016", bibsource = "http://link.springer.com/journal/10009/18/1; https://www.math.utah.edu/pub/tex/bib/elefunt.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sttt.bib", URL = "http://link.springer.com/article/10.1007/s10009-015-0389-7", acknowledgement = ack-nhfb, fjournal = "International Journal on Software Tools for Technology Transfer (STTT)", journal-URL = "http://link.springer.com/journal/10009", } @Article{Lai:2016:QMD, author = "Bo-Cheng Charles Lai and Luis Garrido Platero and Hsien-Kai Kuo", title = "A Quantitative Method to Data Reuse Patterns of {SIMT} Applications", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "2", pages = "73--76", month = jul # "\slash " # dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2491279", ISSN = "1556-6056 (print), 
1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Jun 25 07:41:05 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Understanding data reuse patterns of a computing system is crucial to effective design optimization. The emerging Single Instruction Multiple Threads (SIMT) processor adopts a programming model that is fundamentally disparate from conventional scalar processors. There is a lack of analytical approaches to quantify the data reuse of SIMT applications. This paper presents a quantitative method to study the data reuse inherent to SIMT applications. A metric, Data Reuse Degree, is defined to measure the amount of reused data between memory references, and associate each data reuse degree to a temporal distance representing the virtual time of the execution process. The experiments are performed on an abstracted SIMT processor that considers the programming model and runtime specifics. The experiments illustrate diverse data reuse patterns of SIMT applications and explore the impacts of architectural limitations.", acknowledgement = ack-nhfb, affiliation = "Lai, BCC (Reprint Author), Natl Chiao Tung Univ, Dept Elect Engn, Hsinchu 300, Taiwan. Lai, Bo-Cheng Charles, Natl Chiao Tung Univ, Dept Elect Engn, Hsinchu 300, Taiwan. Platero, Luis Garrido, Barcelona Super Comp Ctr, Barcelona, Spain. Kuo, Hsien-Kai, MediaTek Inc, Hsinchu, Taiwan.", author-email = "bclai@mail.nctu.edu.tw luis.garrido.platero@gmail.com hsienkai.kuo@gmail.com", da = "2019-06-20", doc-delivery-number = "EH9MM", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "MOST [104-2221-E-009-079]", funding-text = "This project was supported by MOST grant 104-2221-E-009-079.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "architectural limitations; cache memory; Cache memory; computing system; data analysis; data reuse degree; data reuse patterns; design optimization; execution process; Graphics processing units; Instruction sets; Measurement; Memory management; multi-threading; Parallel architectures; Parallel architectures, cache memory, parallel processing; parallel processing; Parallel processing; programming model; scalar processors; SIMT applications; SIMT processors; single-instruction multiple-threads processors; virtual time", number-of-cited-references = "11", research-areas = "Computer Science", times-cited = "0", unique-id = "Lai:2016:QMD", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Li:2016:HBG, author = "Jing Li and Hung-Wei Tseng and Chunbin Lin and Yannis Papakonstantinou and Steven Swanson", title = "{HippogriffDB}: balancing {I/O} and {GPU} bandwidth in big data analytics", journal = j-PROC-VLDB-ENDOWMENT, volume = "9", number = "14", pages = "1647--1658", month = oct, year = "2016", CODEN = "????", ISSN = "2150-8097", bibdate = "Wed Oct 12 10:14:56 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "As data sets grow and conventional processor performance scaling slows, data analytics move towards heterogeneous architectures that incorporate hardware accelerators (notably GPUs) to continue scaling performance. However, existing GPU-based databases fail to deal with big data applications efficiently: their execution model suffers from scalability limitations on GPUs whose memory capacity is limited; existing systems fail to consider the discrepancy between fast GPUs and slow storage, which can counteract the benefit of GPU accelerators. In this paper, we propose HippogriffDB, an efficient, scalable GPU-accelerated OLAP system. 
It tackles the bandwidth discrepancy using compression and an optimized data transfer path. HippogriffDB stores tables in a compressed format and uses the GPU for decompression, trading GPU cycles for the improved I/O bandwidth. To improve the data transfer efficiency, HippogriffDB introduces a peer-to-peer, multi-threaded data transfer mechanism, directly transferring data from the SSD to the GPU. HippogriffDB adopts a query-over-block execution model that provides scalability using a stream-based approach. The model improves kernel efficiency with the operator fusion and double buffering mechanism. We have implemented HippogriffDB using an NVMe SSD, which talks directly to a commercial GPU. Results on two popular benchmarks demonstrate its scalability and efficiency. HippogriffDB outperforms existing GPU-based databases (YDB) and in-memory data analytics (MonetDB) by 1-2 orders of magnitude.", acknowledgement = ack-nhfb, fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Ling:2016:MTH, author = "Cheng Ling and Tsuyoshi Hamada and Jingyang Gao and Guoguang Zhao and Donghong Sun and Weifeng Shi", title = "{MrBayes tgMC$^3$++}: a High Performance and Resource-Efficient {GPU}-Oriented Phylogenetic Analysis Method", journal = j-TCBB, volume = "13", number = "5", pages = "845--854", month = sep, year = "2016", CODEN = "ITCBCY", DOI = "https://doi.org/10.1109/TCBB.2015.2495202", ISSN = "1545-5963 (print), 1557-9964 (electronic)", ISSN-L = "1545-5963", bibdate = "Fri Dec 30 16:19:30 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/tcbb.bib", abstract = "MrBayes is a widespread phylogenetic inference tool harnessing empirical evolutionary models and Bayesian statistics. However, the computational cost on the likelihood estimation is very expensive, resulting in undesirably long execution time. 
Although a number of multi-threaded optimizations have been proposed to speed up MrBayes, there are bottlenecks that severely limit the GPU thread-level parallelism of likelihood estimations. This study proposes a high performance and resource-efficient method for GPU-oriented parallelization of likelihood estimations. Instead of having to rely on empirical programming, the proposed novel decomposition storage model implements high performance data transfers implicitly. In terms of performance improvement, a speedup factor of up to 178 can be achieved on the analysis of simulated datasets by four Tesla K40 cards. In comparison to the other publicly available GPU-oriented MrBayes, the tgMC$^3$++ method proposed herein outperforms the tgMC$^3$ v1.0, nMC$^3$ v2.1.1 and oMC$^3$ v1.00 methods by speedup factors of up to 1.6, 1.9 and 2.9, respectively. Moreover, tgMC$^3$++ supports more evolutionary models and gamma categories, which previous GPU-oriented methods fail to take into analysis.", acknowledgement = ack-nhfb, fjournal = "IEEE/ACM Transactions on Computational Biology and Bioinformatics", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J954", } @Article{Liu:2016:PSE, author = "Yongchao Liu and Thomas Hankeln and Bertil Schmidt", title = "Parallel and space-efficient construction of {Burrows--Wheeler} transform and suffix array for big genome data", journal = j-TCBB, volume = "13", number = "3", pages = "592--598", month = may, year = "2016", CODEN = "ITCBCY", ISSN = "1545-5963 (print), 1557-9964 (electronic)", ISSN-L = "1545-5963", bibdate = "Mon Aug 29 06:50:39 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/tcbb.bib", abstract = "Next-generation sequencing technologies have led to the sequencing of more and more genomes, propelling related research into the era of big data. 
In this paper, we present ParaBWT, a parallelized Burrows--Wheeler transform (BWT) and suffix array construction algorithm for big genome data. In ParaBWT, we have investigated a progressive construction approach to constructing the BWT of single genome sequences in linear space complexity, but with a small constant factor. This approach has been further parallelized using multi-threading based on a master-slave coprocessing model. After gaining the BWT, the suffix array is constructed in a memory-efficient manner. The performance of ParaBWT has been evaluated using two sequences generated from two human genome assemblies: the Ensembl Homo sapiens assembly and the human reference genome. Our performance comparison to FMD-index and Bwt-disk reveals that on 12 CPU cores, ParaBWT runs up to $ 2.2 \times $ faster than FMD-index and up to $ 99.0 \times $ faster than Bwt-disk. BWT construction algorithms for very long genomic sequences are time consuming and (due to their incremental nature) inherently difficult to parallelize. Thus, their parallelization is challenging and even relatively small speedups like the ones of our method over FMD-index are of high importance to research. ParaBWT is written in C++, and is freely available at http://parabwt.sourceforge.net.", acknowledgement = ack-nhfb, fjournal = "IEEE/ACM Transactions on Computational Biology and Bioinformatics", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J954", } @Article{Liu:2016:SEA, author = "Qixiao Liu and Miquel Moreto and Jaume Abella and Francisco J. Cazorla and Daniel A. 
Jimenez and Mateo Valero", title = "Sensible Energy Accounting with Abstract Metering for Multicore Systems", journal = j-TACO, volume = "12", number = "4", pages = "60:1--60:??", month = jan, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2842616", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Feb 16 15:36:38 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Chip multicore processors (CMPs) are the preferred processing platform across different domains such as data centers, real-time systems, and mobile devices. In all those domains, energy is arguably the most expensive resource in a computing system. Accurately quantifying energy usage in a multicore environment presents a challenge as well as an opportunity for optimization. Standard metering approaches are not capable of delivering consistent results with shared resources, since the same task with the same inputs may have different energy consumption based on the mix of co-running tasks. However, it is reasonable for data-center operators to charge on the basis of estimated energy usage rather than time since energy is more correlated with their actual cost. This article introduces the concept of Sensible Energy Accounting (SEA). For a task running in a multicore system, SEA accurately estimates the energy the task would have consumed running in isolation with a given fraction of the CMP shared resources. We explain the potential benefits of SEA in different domains and describe two hardware techniques to implement it for a shared last-level cache and on-core resources in SMT processors. 
Moreover, with SEA, an energy-aware scheduler can find a highly efficient on-chip resource assignment, reducing by up to 39\% the total processor energy for a 4-core system.", acknowledgement = ack-nhfb, articleno = "60", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", } @Article{Liu:2016:TAA, author = "Peng Liu and Jiyang Yu and Michael C. Huang", title = "Thread-Aware Adaptive Prefetcher on Multicore Systems: Improving the Performance for Multithreaded Workloads", journal = j-TACO, volume = "13", number = "1", pages = "13:1--13:??", month = apr, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2890505", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Apr 5 16:27:36 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Most processors employ hardware data prefetching techniques to hide memory access latencies. However, the prefetching requests from different threads on a multicore processor can cause severe interference with prefetching and/or demand requests of others. The data prefetching can lead to significant performance degradation due to shared resource contention on shared memory multicore systems. This article proposes a thread-aware data prefetching mechanism based on low-overhead runtime information to tune prefetching modes and aggressiveness, mitigating the resource contention in the memory system. 
Our solution has three new components: (1) a self-tuning prefetcher that uses runtime feedback to dynamically adjust data prefetching modes and arguments of each thread, (2) a filtering mechanism that informs the hardware about which prefetching request can cause shared data invalidation and should be discarded, and (3) a limiter thread acceleration mechanism to estimate and accelerate the critical thread which has the longest completion time in the parallel region of execution. On a set of multithreaded parallel benchmarks, our thread-aware data prefetching mechanism improves the overall performance of 64-core system by 13\% over a multimode prefetch baseline system with two-level cache organization and conventional modified, exclusive, shared, and invalid-based directory coherence protocol. We compare our approach with the feedback directed prefetching technique and find that it provides 9\% performance improvement on multicore systems, while saving the memory bandwidth consumption.", acknowledgement = ack-nhfb, articleno = "13", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", } @Article{Lozi:2016:FPL, author = "Jean-Pierre Lozi and Florian David and Ga{\"e}l Thomas and Julia Lawall and Gilles Muller", title = "Fast and Portable Locking for Multicore Architectures", journal = j-TOCS, volume = "33", number = "4", pages = "13:1--13:??", month = jan, year = "2016", CODEN = "ACSYEC", DOI = "https://doi.org/10.1145/2845079", ISSN = "0734-2071 (print), 1557-7333 (electronic)", ISSN-L = "0734-2071", bibdate = "Wed Jan 6 06:45:30 MST 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tocs/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/tocs.bib", abstract = "The scalability of multithreaded applications on current multicore systems is hampered by the performance of lock algorithms, due to the costs of access contention and 
cache misses. The main contribution presented in this article is a new locking technique, Remote Core Locking (RCL), that aims to accelerate the execution of critical sections in legacy applications on multicore architectures. The idea of RCL is to replace lock acquisitions by optimized remote procedure calls to a dedicated server hardware thread. RCL limits the performance collapse observed with other lock algorithms when many threads try to acquire a lock concurrently and removes the need to transfer lock-protected shared data to the hardware thread acquiring the lock, because such data can typically remain in the server's cache. Other contributions presented in this article include a profiler that identifies the locks that are the bottlenecks in multithreaded applications and that can thus benefit from RCL, and a reengineering tool that transforms POSIX lock acquisitions into RCL locks. Eighteen applications were used to evaluate RCL: the nine applications of the SPLASH-2 benchmark suite, the seven applications of the Phoenix 2 benchmark suite, Memcached, and Berkeley DB with a TPC-C client. Eight of these applications are unable to scale because of locks and benefit from RCL on an x86 machine with four AMD Opteron processors and 48 hardware threads. By using RCL instead of Linux POSIX locks, performance is improved by up to 2.5 times on Memcached, and up to 11.6 times on Berkeley DB with the TPC-C client. On a SPARC machine with two Sun Ultrasparc T2+ processors and 128 hardware threads, three applications benefit from RCL. In particular, performance is improved by up to 1.3 times with respect to Solaris POSIX locks on Memcached, and up to 7.9 times on Berkeley DB with the TPC-C client.", acknowledgement = ack-nhfb, articleno = "13", fjournal = "ACM Transactions on Computer Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J774", } @Article{Lu:2016:VCV, author = "Yaojie Lu and Seyedamin Rooholamin and Sotirios G. 
Ziavras", title = "Vector Coprocessor Virtualization for Simultaneous Multithreading", journal = j-TECS, volume = "15", number = "3", pages = "57:1--57:??", month = jul, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2898364", ISSN = "1539-9087 (print), 1558-3465 (electronic)", ISSN-L = "1539-9087", bibdate = "Thu Jul 21 17:18:13 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/tecs.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Vector coprocessors (VPs), commonly being assigned exclusively to a single thread/core, are not often performance and energy efficient due to mismatches with the vector needs of individual applications. We present in this article an easy-to-implement VP virtualization technique that, when applied, enables a multithreaded VP to simultaneously execute multiple threads of similar or arbitrary vector lengths to achieve improved aggregate utilization. With a vector register file (VRF) virtualization technique invented to dynamically allocate physical vector registers to threads, our VP virtualization approach improves programmer productivity by providing at runtime a distinct physical register name space to each competing thread, thus eliminating the need to solve register-name conflicts statically. We applied our virtualization technique to a multithreaded VP and prototyped an FPGA-based multicore processor system that supports VP sharing as well as power gating for better energy efficiency. 
Under the dynamic creation of disparate threads, our benchmarking results show impressive VP speedups of up to 333\% and total energy savings of up to 37\% with proper thread scheduling and power gating compared to a similar-sized system that allows VP access to just one thread at a time.", acknowledgement = ack-nhfb, articleno = "57", fjournal = "ACM Transactions on Embedded Computing Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J840", } @Article{Machado:2016:CDD, author = "Nuno Machado and Daniel Quinta and Brandon Lucia and Lu{\'\i}s Rodrigues", title = "Concurrency Debugging with Differential Schedule Projections", journal = j-TOSEM, volume = "25", number = "2", pages = "14:1--14:??", month = may, year = "2016", CODEN = "ATSMER", DOI = "https://doi.org/10.1145/2885495", ISSN = "1049-331X (print), 1557-7392 (electronic)", ISSN-L = "1049-331X", bibdate = "Mon May 16 16:22:08 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/tosem/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/tosem.bib", abstract = "We present Symbiosis: a concurrency debugging technique based on novel differential schedule projections (DSPs). A DSP shows the small set of memory operations and dataflows responsible for a failure, as well as a reordering of those elements that avoids the failure. To build a DSP, Symbiosis first generates a full, failing, multithreaded schedule via thread path profiling and symbolic constraint solving. Symbiosis selectively reorders events in the failing schedule to produce a nonfailing, alternate schedule. A DSP reports the ordering and dataflow differences between the failing and nonfailing schedules. Our evaluation on buggy real-world software and benchmarks shows that, in practical time, Symbiosis generates DSPs that both isolate the small fraction of event orders and dataflows responsible for the failure and report which event reorderings prevent failing. 
In our experiments, DSPs contain 90\% fewer events and 96\% fewer dataflows than the full failure-inducing schedules. We also conducted a user study that shows that, by allowing developers to focus on only a few events, DSPs reduce the amount of time required to understand the bug's root cause and find a valid fix.", acknowledgement = ack-nhfb, articleno = "14", fjournal = "ACM Transactions on Software Engineering and Methodology", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J790", } @Article{Marino:2016:DXU, author = "Daniel Marino and Abhayendra Singh and Todd Millstein and Madanlal Musuvathi and Satish Narayanasamy", title = "{DRFx}: an Understandable, High Performance, and Flexible Memory Model for Concurrent Languages", journal = j-TOPLAS, volume = "38", number = "4", pages = "16:1--16:??", month = oct, year = "2016", CODEN = "ATPSDT", DOI = "https://doi.org/10.1145/2925988", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Tue Oct 18 11:41:44 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/toplas/; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/toplas.bib", abstract = "The most intuitive memory model for shared-memory multi-threaded programming is sequential consistency (SC), but it disallows the use of many compiler and hardware optimizations and thus affects performance. Data-race-free (DRF) models, such as the C++11 memory model, guarantee SC execution for data-race-free programs. But these models provide no guarantee at all for racy programs, compromising the safety and debuggability of such programs. To address the safety issue, the Java memory model, which is also based on the DRF model, provides a weak semantics for racy executions. 
However, this semantics is subtle and complex, making it difficult for programmers to reason about their programs and for compiler writers to ensure the correctness of compiler optimizations. We present the DRFx memory model, which is simple for programmers to understand and use while still supporting many common optimizations. We introduce a memory model (MM) exception that can be signaled to halt execution. If a program executes without throwing this exception, then DRFx guarantees that the execution is SC. If a program throws an MM exception during an execution, then DRFx guarantees that the program has a data race. We observe that SC violations can be detected in hardware through a lightweight form of conflict detection. Furthermore, our model safely allows aggressive compiler and hardware optimizations within compiler-designated program regions. We formalize our memory model, prove several properties of this model, describe a compiler and hardware design suitable for DRFx, and evaluate the performance overhead due to our compiler and hardware requirements.", acknowledgement = ack-nhfb, articleno = "16", fjournal = "ACM Transactions on Programming Languages and Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783", } @Article{Marinov:2016:PAF, author = "Martin Marinov and Nicholas Nash and David Gregg", title = "Practical Algorithms for Finding Extremal Sets", journal = j-ACM-J-EXP-ALGORITHMICS, volume = "21", number = "1", pages = "1.9:1--1.9:??", month = nov, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2893184", ISSN = "1084-6654", bibdate = "Fri Nov 4 16:46:55 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/jea.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The minimal sets within a collection of sets are defined as the ones that do not have a proper subset within the collection, and the maximal sets are the ones that do not have a proper superset within the collection. 
Identifying extremal sets is a fundamental problem with a wide range of applications in SAT solvers, data mining, and social network analysis. In this article, we present two novel improvements of the high-quality extremal set identification algorithm, AMS-Lex, described by Bayardo and Panda. The first technique uses memoization to improve the execution time of the single-threaded variant of the AMS-Lex, while our second improvement uses parallel programming methods. In a subset of the presented experiments, our memoized algorithm executes more than 400 times faster than the highly efficient publicly available implementation of AMS-Lex. Moreover, we show that our modified algorithm's speedup is not bounded above by a constant and that it increases as the length of the common prefixes in successive input itemsets increases. We provide experimental results using both real-world and synthetic datasets, and show our multithreaded variant algorithm outperforming AMS-Lex by 3 to 6 times. We find that on synthetic input datasets, when executed using 16 CPU cores of a 32-core machine, our multithreaded program executes about as fast as the state-of-the-art parallel GPU-based program using an NVIDIA GTX 580 graphics processing unit.", acknowledgement = ack-nhfb, articleno = "1.9", fjournal = "Journal of Experimental Algorithmics (JEA)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J430", } @Article{Moreland:2016:VMA, author = "Kenneth Moreland and Christopher Sewell and William Usher and Li-ta Lo and Jeremy Meredith and David Pugmire and James Kress and Hendrik Schroots and Kwan-Liu Ma and Hank Childs and Matthew Larsen and Chun-Ming Chen and Robert Maynard and Berk Geveci", title = "{VTK-m}: Accelerating the Visualization Toolkit for Massively Threaded Architectures", journal = j-IEEE-CGA, volume = "36", number = "3", pages = "48--58", month = may # "\slash " # jun, year = "2016", CODEN = "ICGADZ", ISSN = "0272-1716 (print), 1558-1756 (electronic)", ISSN-L = 
"0272-1716", bibdate = "Wed Oct 5 07:24:20 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecga.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "https://www.computer.org/csdl/mags/cg/2016/03/mcg2016030048-abs.html", acknowledgement = ack-nhfb, journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=38", } @Article{Narayanaswamy:2016:VCA, author = "Ganesh Narayanaswamy and Saurabh Joshi and Daniel Kroening", title = "The virtues of conflict: analysing modern concurrency", journal = j-SIGPLAN, volume = "51", number = "8", pages = "25:1--25:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851165", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Modern shared memory multiprocessors permit reordering of memory operations for performance reasons. These reorderings are often a source of subtle bugs in programs written for such architectures. Traditional approaches to verify weak memory programs often rely on interleaving semantics, which is prone to state space explosion, and thus severely limits the scalability of the analysis. In recent times, there has been a renewed interest in modelling dynamic executions of weak memory programs using partial orders. However, such an approach typically requires ad-hoc mechanisms to correctly capture the data and control-flow choices/conflicts present in real-world programs. In this work, we propose a novel, conflict-aware, composable, truly concurrent semantics for programs written using C/C++ for modern weak memory architectures. We exploit our symbolic semantics based on general event structures to build an efficient decision procedure that detects assertion violations in bounded multi-threaded programs. 
Using a large, representative set of benchmarks, we show that our conflict-aware semantics outperforms the state-of-the-art partial-order based approaches.", acknowledgement = ack-nhfb, articleno = "25", fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "PPoPP '16 conference proceedings.", } @Article{Nogueira:2016:BBW, author = "David Nogueira and Pedro Tom{\'a}s and Nuno Roma", title = "{BowMapCL}: {Burrows--Wheeler} Mapping on Multiple Heterogeneous Accelerators", journal = j-TCBB, volume = "13", number = "5", pages = "926--938", month = sep, year = "2016", CODEN = "ITCBCY", DOI = "https://doi.org/10.1109/TCBB.2015.2495149", ISSN = "1545-5963 (print), 1557-9964 (electronic)", ISSN-L = "1545-5963", bibdate = "Fri Dec 30 16:19:30 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/tcbb.bib", abstract = "The computational demand of exact-search procedures has pressed the exploitation of parallel processing accelerators to reduce the execution time of many applications. However, this often imposes strict restrictions in terms of the problem size and implementation efforts, mainly due to their possibly distinct architectures. To circumvent this limitation, a new exact-search alignment tool BowMapCL based on the Burrows--Wheeler Transform and FM-Index is presented. Contrasting to other alternatives, BowMapCL is based on a unified implementation using OpenCL, allowing the exploitation of multiple and possibly different devices e.g., NVIDIA, AMD/ATI, and Intel GPUs/APUs. Furthermore, to efficiently exploit such heterogeneous architectures, BowMapCL incorporates several techniques to promote its performance and scalability, including multiple buffering, work-queue task-distribution, and dynamic load-balancing, together with index partitioning, bit-encoding, and sampling. 
When compared with state-of-the-art tools, the attained results showed that BowMapCL using a single GPU is $ 2 \times $ to $ 7.5 \times $ faster than mainstream multi-threaded CPU BWT-based aligners, like Bowtie, BWA, and SOAP2; and up to $ 4 \times $ faster than the best performing state-of-the-art GPU implementations namely, SOAP3 and HPG-BWT. When multiple and completely distinct devices are considered, BowMapCL efficiently scales the offered throughput, ensuring a convenient load-balance of the involved processing in the several distinct devices.", acknowledgement = ack-nhfb, fjournal = "IEEE/ACM Transactions on Computational Biology and Bioinformatics", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J954", } @Article{Norris:2016:PAM, author = "Brian Norris and Brian Demsky", title = "A Practical Approach for Model Checking {C\slash C++11} Code", journal = j-TOPLAS, volume = "38", number = "3", pages = "10:1--10:??", month = may, year = "2016", CODEN = "ATPSDT", DOI = "https://doi.org/10.1145/2806886", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Mon May 2 16:24:58 MDT 2016", bibsource = "http://www.acm.org/pubs/contents/journals/toplas/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/toplas.bib", abstract = "Writing low-level concurrent software has traditionally required intimate knowledge of the entire toolchain and often has involved coding in assembly. New language standards have extended C and C++ with support for low-level atomic operations and a weak memory model, enabling developers to write portable and efficient multithreaded code. In this article, we present CDSChecker, a tool for exhaustively exploring the behaviors of concurrent code under the C/C++ memory model. 
We have used CDSChecker to exhaustively unit test concurrent data structure implementations and have discovered errors in a published implementation of a work-stealing queue and a single producer, single consumer queue.", acknowledgement = ack-nhfb, articleno = "10", fjournal = "ACM Transactions on Programming Languages and Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783", } @Article{Papadopoulos:2016:TAD, author = "Stavros Papadopoulos and Kushal Datta and Samuel Madden and Timothy Mattson", title = "The {TileDB} array data storage manager", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "4", pages = "349--360", month = nov, year = "2016", CODEN = "????", DOI = "https://doi.org/10.14778/3025111.3025117", ISSN = "2150-8097", bibdate = "Sat Feb 25 09:01:51 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "We present a novel storage manager for multi-dimensional arrays that arise in scientific applications, which is part of a larger scientific data management system called TileDB. In contrast to existing solutions, TileDB is optimized for both dense and sparse arrays. Its key idea is to organize array elements into ordered collections called fragments. Each fragment is dense or sparse, and groups contiguous array elements into data tiles of fixed capacity. The organization into fragments turns random writes into sequential writes, and, coupled with a novel read algorithm, leads to very efficient reads. TileDB enables parallelization via multi-threading and multi-processing, offering thread-/process-safety and atomicity via lightweight locking. We show that TileDB delivers comparable performance to the HDF5 dense array storage manager, while providing much faster random writes. We also show that TileDB offers substantially faster reads and writes than the SciDB array database system with both dense and sparse arrays. 
Finally, we demonstrate that TileDB is considerably faster than adaptations of the Vertica relational column-store for dense array storage management, and at least as fast for the case of sparse arrays.", acknowledgement = ack-nhfb, fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Park:2016:CJP, author = "Hyukwoo Park and Myungsu Cha and Soo-Mook Moon", title = "Concurrent {JavaScript} Parsing for Faster Loading of {Web} Apps", journal = j-TACO, volume = "13", number = "4", pages = "41:1--41:??", month = dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/3004281", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 28 16:24:46 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "JavaScript is a dynamic language mainly used as a client-side web script. Nowadays, web is evolving into an application platform with its web apps, and JavaScript increasingly undertakes complex computations and interactive user interfaces, requiring a high-performance JavaScript engine. There have been many optimizations for efficient JavaScript engines, but one component that has not been optimized much is JavaScript parsing. A JavaScript function needs to be parsed before being executed, and the parsing overhead takes a substantial portion of JavaScript execution time for web apps, especially during app loading. This article proposes concurrent parsing of JavaScript, which performs the parsing of JavaScript functions in advance on different threads, while the main thread is executing the parsed JavaScript functions. This can hide the parsing overhead from the main execution thread, reducing the JavaScript execution time, thus reducing the overall app loading time. 
More specifically, we separated JavaScript parsing and made it run on different threads without violating the execution semantics of JavaScript. We also designed an efficient multi-threaded parsing architecture, which reduces the synchronization overhead and schedules the parsing requests appropriately. Finally, we explored two methods of choosing the target functions for concurrent parsing: one based on profiled information and the other based on speculative heuristics. We performed experiments on the WebKit browser with the JSC engine for real web apps. The result shows that the proposed concurrent parsing can improve the JavaScript performance during app loading by as much as 64\% and by 39.7\% on average. This improves the whole app loading performance tangibly, by as much as 32.7\% and by 18.2\%, on average.", acknowledgement = ack-nhfb, articleno = "41", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", } @Article{Pusukuri:2016:TEL, author = "Kishore Kumar Pusukuri and Rajiv Gupta and Laxmi N. Bhuyan", title = "{Tumbler}: an Effective Load-Balancing Technique for Multi-{CPU} Multicore Systems", journal = j-TACO, volume = "12", number = "4", pages = "36:1--36:??", month = jan, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2827698", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Feb 16 15:36:38 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Schedulers used by modern OSs (e.g., Oracle Solaris 11{\trademark} and GNU/Linux) balance load by balancing the number of threads in run queues of different cores. While this approach is effective for a single CPU multicore system, we show that it can lead to a significant load imbalance across CPUs of a multi-CPU multicore system. 
Because different threads of a multithreaded application often exhibit different levels of CPU utilization, load cannot be measured in terms of the number of threads alone. We propose Tumbler that migrates the threads of a multithreaded program across multiple CPUs to balance the load across the CPUs. While Tumbler distributes the threads equally across the CPUs, its assignment of threads to CPUs is aimed at minimizing the variation in utilization of different CPUs to achieve load balance. We evaluated Tumbler using a wide variety of 35 multithreaded applications, and our experimental results show that Tumbler outperforms both Oracle Solaris 11{\trademark} and GNU/Linux.", acknowledgement = ack-nhfb, articleno = "36", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", } @Article{Qian:2016:EFS, author = "Junjie Qian and Witawas Srisa-an and Sharad Seth and Hong Jiang and Du Li and Pan Yi", title = "Exploiting {FIFO} Scheduler to Improve Parallel Garbage Collection Performance", journal = j-SIGPLAN, volume = "51", number = "7", pages = "109--121", month = jul, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3007611.2892248", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Recent studies have found that parallel garbage collection performs worse with more CPUs and more collector threads. As part of this work, we further investigate this phenomenon and find that poor scalability is worst in highly scalable Java applications. Our investigation to find the causes clearly reveals that efficient multi-threading in an application can prolong the average object lifespan, which results in less effective garbage collection. 
We also find that prolonging lifespan is the direct result of Linux's Completely Fair Scheduler due to its round-robin like behavior that can increase the heap contention between the application threads. Instead, if we use pseudo first-in-first-out to schedule application threads in large multicore systems, the garbage collection scalability is significantly improved while the time spent in garbage collection is reduced by as much as 21\%. The average execution time of the 24 Java applications used in our study is also reduced by 11\%. Based on this observation, we propose two approaches to optimally select scheduling policies based on application scalability profile. Our first approach uses the profile information from one execution to tune the subsequent executions. Our second approach dynamically collects profile information and performs policy selection during execution.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "VEE '16 conference proceedings.", } @Article{Qian:2016:ODG, author = "Xuehai Qian and Koushik Sen and Paul Hargrove and Costin Iancu", title = "{OPR}: deterministic group replay for one-sided communication", journal = j-SIGPLAN, volume = "51", number = "8", pages = "47:1--47:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851179", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The ability to reproduce a parallel execution is desirable for debugging and program reliability purposes. In debugging (13), the programmer needs to manually step back in time, while for resilience (6) this is automatically performed by the application upon failure. 
To be useful, replay has to faithfully reproduce the original execution. For parallel programs the main challenge is inferring and maintaining the order of conflicting operations (data races). Deterministic record and replay (R{\&}R) techniques have been developed for multithreaded shared memory programs (5), as well as distributed memory programs (14). Our main interest is techniques for large scale scientific (3; 4) programming models.", acknowledgement = ack-nhfb, articleno = "47", fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "PPoPP '16 conference proceedings.", } @Article{Radojkovic:2016:TAM, author = "P. Radojkovic and P. M. Carpenter and M. Moreto and V. Cakarevic and J. Verdu and A. Pajuelo and F. J. Cazorla and M. Nemirovsky and M. Valero", title = "Thread Assignment in Multicore\slash Multithreaded Processors: A Statistical Approach", journal = j-IEEE-TRANS-COMPUT, volume = "65", number = "1", pages = "256--269", month = "????", year = "2016", CODEN = "ITCOB4", DOI = "https://doi.org/10.1109/TC.2015.2417533", ISSN = "0018-9340 (print), 1557-9956 (electronic)", ISSN-L = "0018-9340", bibdate = "Tue Dec 15 09:36:24 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Computers", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12", } @Article{Samak:2016:DSF, author = "Malavika Samak and Omer Tripp and Murali Krishna Ramanathan", title = "Directed synthesis of failing concurrent executions", journal = j-SIGPLAN, volume = "51", number = "10", pages = "430--446", month = oct, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3022671.2984040", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:13 MDT 2017", bibsource = 
"https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Detecting concurrency-induced bugs in multithreaded libraries can be challenging due to the intricacies associated with their manifestation. This includes invocation of multiple methods, synthesis of inputs to the methods to reach the failing location, and crafting of thread interleavings that cause the erroneous behavior. Neither fuzzing-based testing techniques nor over-approximate static analyses are well positioned to detect such subtle defects while retaining high accuracy alongside satisfactory coverage. In this paper, we propose a directed, iterative and scalable testing engine that combines the strengths of static and dynamic analysis to help synthesize concurrent executions to expose complex concurrency-induced bugs. Our engine accepts as input the library, its client (either sequential or concurrent) and a specification of correctness. Then, it iteratively refines the client to generate an execution that can break the input specification. Each step of the iterative process includes statically identifying sub-goals towards the goal of failing the specification, generating a plan toward meeting these goals, and merging of the paths traversed dynamically with the plan computed statically via constraint solving to generate a new client. The engine reports full reproduction scenarios, guaranteed to be true, for the bugs it finds. We have created a prototype of our approach named MINION. We validated MINION by applying it to well-tested concurrent classes from popular Java libraries, including the latest versions of OpenJDK and Google-Guava. We were able to detect 31 real crashes across 10 classes in a total of 23 minutes, including previously unknown bugs. 
Comparison with three other tools reveals that combined, they report only 9 of the 31 crashes (and no other crashes beyond MINION). This is because several of these bugs manifest under deeply nested path conditions (observed maximum of 11), deep nesting of method invocations (observed maximum of 6) and multiple refinement iterations to generate the crash-inducing client.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "OOPSLA '16 conference proceedings.", } @Article{Sleiman:2016:ESO, author = "Faissal M. Sleiman and Thomas F. Wenisch", title = "Efficiently scaling out-of-order cores for simultaneous multithreading", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "431--443", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001183", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "Simultaneous multithreading (SMT) out-of-order cores waste a significant portion of structural out-of-order core resources on instructions that do not need them. These resources eliminate false ordering dependences. However, because thread interleaving spreads dependent instructions, nearly half of instructions dynamically issue in program order after all false dependences have resolved. These in-sequence instructions interleave with other reordered instructions at a fine granularity within the instruction window. We develop a technique to efficiently scale in-flight instructions through a hybrid out-of-order/in-order microarchitecture, which can dispatch instructions to efficient in-order scheduling mechanisms---using a FIFO issue queue called the shelf ---on an instruction-by-instruction basis. 
Instructions dispatched to the shelf do not allocate out-of-order core resources in the reorder buffer, issue queue, physical registers, or load-store queues. We measure opportunity for such hybrid microarchitectures and design and evaluate a practical dispatch mechanism targeted at 4-threaded cores. Adding a shelf to a baseline 4-thread system with 64-entry ROB improves normalized system throughput by 11.5\% (up to 19.2\% at best) and energy-delay product by 10.9\% (up to 17.5\% at best).", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", remark = "ISCA '16 conference proceedings.", } @Article{Tian:2016:ETR, author = "Zhenzhou Tian and Ting Liu and Qinghua Zheng and Ming Fan and Eryue Zhuang and Zijiang Yang", title = "Exploiting thread-related system calls for plagiarism detection of multithreaded programs", journal = j-J-SYST-SOFTW, volume = "119", number = "??", pages = "136--148", month = sep, year = "2016", CODEN = "JSSODM", ISSN = "0164-1212 (print), 1873-1228 (electronic)", ISSN-L = "0164-1212", bibdate = "Sat Jul 16 18:10:04 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/jsystsoftw.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.sciencedirect.com/science/article/pii/S0164121216300838", acknowledgement = ack-nhfb, fjournal = "Journal of Systems and Software", journal-URL = "http://www.sciencedirect.com/science/journal/01641212/", } @Article{Vale:2016:PDT, author = "Tiago M. Vale and Jo{\~a}o A. Silva and Ricardo J. Dias and Jo{\~a}o M. 
Louren{\c{c}}o", title = "{Pot}: Deterministic Transactional Execution", journal = j-TACO, volume = "13", number = "4", pages = "52:1--52:??", month = dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/3017993", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 28 16:24:46 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "This article presents Pot, a system that leverages the concept of preordered transactions to achieve deterministic multithreaded execution of programs that use Transactional Memory. Preordered transactions eliminate the root cause of nondeterminism in transactional execution: they provide the illusion of executing in a deterministic serial order, unlike traditional transactions that appear to execute in a nondeterministic order that can change from execution to execution. Pot uses a new concurrency control protocol that exploits the serialization order to distinguish between fast and speculative transaction execution modes in order to mitigate the overhead of imposing a deterministic order. We build two Pot prototypes: one using STM and another using off-the-shelf HTM. To the best of our knowledge, Pot enables deterministic execution of programs using off-the-shelf HTM for the first time. An experimental evaluation shows that Pot achieves deterministic execution of TM programs with low overhead, sometimes even outperforming nondeterministic executions, and clearly outperforming the state of the art.", acknowledgement = ack-nhfb, articleno = "52", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", } @Article{VanZee:2016:BFE, author = "Field G. {Van Zee} and Tyler M. Smith and Bryan Marker and Tze Meng Low and Robert A. {van de Geijn} and Francisco D. 
Igual and Mikhail Smelyanskiy and Xianyi Zhang and Michael Kistler and Vernon Austel and John A. Gunnels and Lee Killough", title = "The {BLIS} Framework: Experiments in Portability", journal = j-TOMS, volume = "42", number = "2", pages = "12:1--12:19", month = jun, year = "2016", CODEN = "ACMSCU", DOI = "https://doi.org/10.1145/2755561", ISSN = "0098-3500 (print), 1557-7295 (electronic)", ISSN-L = "0098-3500", bibdate = "Fri Jun 3 18:52:21 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/toms.bib", abstract = "BLIS is a new software framework for instantiating high-performance BLAS-like dense linear algebra libraries. We demonstrate how BLIS acts as a productivity multiplier by using it to implement the level-3 BLAS on a variety of current architectures. The systems for which we demonstrate the framework include state-of-the-art general-purpose, low-power, and many-core architectures. We show, with very little effort, how the BLIS framework yields sequential and parallel implementations that are competitive with the performance of ATLAS, OpenBLAS (an effort to maintain and extend the GotoBLAS), and commercial vendor implementations such as AMD's ACML, IBM's ESSL, and Intel's MKL libraries. 
Although most of this article focuses on single-core implementation, we also provide compelling results that suggest the framework's leverage extends to the multithreaded domain.", acknowledgement = ack-nhfb, articleno = "12", fjournal = "ACM Transactions on Mathematical Software (TOMS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J782", } @Article{Verdu:2016:PSA, author = "Javier Verdu and Alex Pajuelo", title = "Performance Scalability Analysis of {JavaScript} Applications with {Web Workers}", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "15", number = "2", pages = "105--108", month = jul # "\slash " # dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2015.2494585", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Jun 20 17:18:18 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Web applications are getting closer to the performance of native applications taking advantage of new standard-based technologies. The recent HTML5 standard includes, among others, the Web Workers API that allows executing JavaScript applications on multiple threads, or workers. However, the internals of the browser's JavaScript virtual machine does not expose direct relation between workers and running threads in the browser and the utilization of logical cores in the processor. As a result, developers do not know how performance actually scales on different environments and therefore what is the optimal number of workers on parallel JavaScript codes. This paper presents the first performance scalability analysis of parallel web apps with multiple workers. We focus on two case studies representative of different worker execution models. Our analyses show performance scaling on different parallel processor microarchitectures and on three major web browsers in the market. 
Besides, we study the impact of co-running applications on the web app performance. The results provide insights for future approaches to automatically find out the optimal number of workers that provide the best tradeoff between performance and resource usage to preserve system responsiveness and user experience, especially on environments with unexpected changes on system workload.", acknowledgement = ack-nhfb, affiliation = "Verdu, J (Reprint Author), BarcelonaTECH UPC, Dept Comp Architecture, Barcelona, Spain. Verdu, Javier; Pajuelo, Alex, BarcelonaTECH UPC, Dept Comp Architecture, Barcelona, Spain.", author-email = "jverdu@ac.upc.edu mpajuelo@ac.upc.edu", da = "2019-06-20", doc-delivery-number = "EH9MM", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Spanish Ministry of Economy and Competitiveness (MINECO) [TIN2012-34557]", funding-text = "This work has been supported by the Spanish Ministry of Economy and Competitiveness (MINECO) under contract TIN2012-34557.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "HTML5; javascript; multithreading; parallelism; web apps; web workers", number-of-cited-references = "12", oa = "Green Published", ORCID-numbers = "Pajuelo, Alex/0000-0002-5510-6860 Verdu Mula, Javier/0000-0003-4485-2419", research-areas = "Computer Science", times-cited = "1", unique-id = "Verdu:2016:PSA", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Yao:2016:OCO, author = "Yuan Yao and Zhonghai Lu", title = "Opportunistic competition overhead reduction for expediting critical section in {NoC} based {CMPs}", journal = j-COMP-ARCH-NEWS, volume = "44", number = "3", pages = "279--290", month = jun, year = "2016", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3007787.3001167", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 12 18:43:43 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "With the degree of parallelism increasing, performance of multi-threaded shared variable applications is not only limited by serialized critical section execution, but also by the serialized competition overhead for threads to get access to critical section. As the number of concurrent threads grows, such competition overhead may exceed the time spent in critical section itself, and become the dominating factor limiting the performance of parallel applications. In modern operating systems, queue spinlock, which comprises a low-overhead spinning phase and a high-overhead sleeping phase, is often used to lock critical sections. In the paper, we show that this advanced locking solution may create very high competition overhead for multithreaded applications executing in NoC-based CMPs. 
Then we propose a software-hardware cooperative mechanism that can opportunistically maximize the chance that a thread wins the critical section access in the low-overhead spinning phase, thereby reducing the competition overhead. At the OS primitives level, we monitor the remaining times of retry (RTR) in a thread's spinning phase, which reflects in how long the thread must enter into the high-overhead sleep mode. At the hardware level, we integrate the RTR information into the packets of locking requests, and let the NoC prioritize locking request packets according to the RTR information. The principle is that the smaller RTR a locking request packet carries, the higher priority it gets and thus quicker delivery. We evaluate our opportunistic competition overhead reduction technique with cycle-accurate full-system simulations in GEM5 using PARSEC (11 programs) and SPEC OMP2012 (14 programs) benchmarks. Compared to the original queue spinlock implementation, experimental results show that our method can effectively increase the opportunity of threads entering the critical section in low-overhead spinning phase, reducing the competition overhead averagely by 39.9\% (maximally by 61.8\%) and accelerating the execution of the Region-of-Interest averagely by 14.4\% (maximally by 24.5\%) across all 25 benchmark programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", remark = "ISCA '16 conference proceedings.", } @Article{Yiapanis:2016:CDS, author = "Paraskevas Yiapanis and Gavin Brown and Mikel Luj{\'a}n", title = "Compiler-Driven Software Speculation for Thread-Level Parallelism", journal = j-TOPLAS, volume = "38", number = "2", pages = "5:1--5:??", month = jan, year = "2016", CODEN = "ATPSDT", DOI = "https://doi.org/10.1145/2821505", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Tue Jan 5 16:31:06 MST 2016", bibsource = 
"http://www.acm.org/pubs/contents/journals/toplas/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/toplas.bib", abstract = "Current parallelizing compilers can tackle applications exercising regular access patterns on arrays or affine indices, where data dependencies can be expressed in a linear form. Unfortunately, there are cases that independence between statements of code cannot be guaranteed and thus the compiler conservatively produces sequential code. Programs that involve extensive pointer use, irregular access patterns, and loops with unknown number of iterations are examples of such cases. This limits the extraction of parallelism in cases where dependencies are rarely or never triggered at runtime. Speculative parallelism refers to methods employed during program execution that aim to produce a valid parallel execution schedule for programs immune to static parallelization. The motivation for this article is to review recent developments in the area of compiler-driven software speculation for thread-level parallelism and how they came about. The article is divided into two parts. In the first part the fundamentals of speculative parallelization for thread-level parallelism are explained along with a design choice categorization for implementing such systems. Design choices include the ways speculative data is handled, how data dependence violations are detected and resolved, how the correct data are made visible to other threads, or how speculative threads are scheduled. The second part is structured around those design choices providing the advances and trends in the literature with reference to key developments in the area. 
Although the focus of the article is in software speculative parallelization, a section is dedicated for providing the interested reader with pointers and references for exploring similar topics such as hardware thread-level speculation, transactional memory, and automatic parallelization.", acknowledgement = ack-nhfb, articleno = "5", fjournal = "ACM Transactions on Programming Languages and Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783", } @Article{Yu:2016:DLR, author = "Hairong Yu and Guohui Li and Jianjun Li and Lihchyun Shu", title = "{DO$_{\rm cyclical}$}: a Latency-Resistant Cyclic Multi-Threading Approach for Automatic Program Parallelization", journal = j-COMP-J, volume = "59", number = "8", pages = "1155--1173", month = aug, year = "2016", CODEN = "CMPJA6", DOI = "https://doi.org/10.1093/comjnl/bxv125", ISSN = "0010-4620 (print), 1460-2067 (electronic)", ISSN-L = "0010-4620", bibdate = "Tue Aug 30 07:10:50 MDT 2016", bibsource = "http://comjnl.oxfordjournals.org/content/59/8.toc; https://www.math.utah.edu/pub/tex/bib/compj2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://comjnl.oxfordjournals.org/content/59/8/1155", acknowledgement = ack-nhfb, fjournal = "Computer Journal", journal-URL = "http://comjnl.oxfordjournals.org/", onlinedate = "January 14, 2016", } @Article{Zhang:2016:SAN, author = "Mingzhe Zhang and Francis C. M. 
Lau and Cho-Li Wang and Luwei Cheng and Haibo Chen", title = "Scalable adaptive {NUMA}-aware lock: combining local locking and remote locking for efficient concurrency", journal = j-SIGPLAN, volume = "51", number = "8", pages = "50:1--50:??", month = aug, year = "2016", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3016078.2851176", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Scalable locking is a key building block for scalable multi-threaded software. Its performance is especially critical in multi-socket, multi-core machines with non-uniform memory access (NUMA). Previous schemes such as local locking and remote locking only perform well under a certain level of contention, and often require non-trivial tuning for a particular configuration. Besides, for large NUMA systems, because of unmanaged lock server's nomination, current distance-first NUMA policies cannot perform satisfactorily. In this work, we propose SANL, a locking scheme that can deliver high performance under various contention levels by adaptively switching between the local and the remote lock scheme. Furthermore, we introduce a new NUMA policy for the remote lock that jointly considers node distances and server utilization when choosing lock servers. A comparison with seven representative locking schemes shows that SANL outperforms the others in most contention situations. 
In one group test, SANL is 3.7 times faster than RCL lock and 17 times faster than POSIX mutex.", acknowledgement = ack-nhfb, articleno = "50", fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "PPoPP '16 conference proceedings.", } @Article{Zhang:2016:TED, author = "Tong Zhang and Dongyoon Lee and Changhee Jung", title = "{TxRace}: Efficient Data Race Detection Using Commodity Hardware Transactional Memory", journal = j-OPER-SYS-REV, volume = "50", number = "2", pages = "159--173", month = jun, year = "2016", CODEN = "OSRED8", DOI = "https://doi.org/10.1145/2954680.2872384", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Thu Jun 9 17:03:34 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/opersysrev.bib", abstract = "Detecting data races is important for debugging shared-memory multithreaded programs, but the high runtime overhead prevents the wide use of dynamic data race detectors. This paper presents TxRace, a new software data race detector that leverages commodity hardware transactional memory (HTM) to speed up data race detection. TxRace instruments a multithreaded program to transform synchronization-free regions into transactions, and exploits the conflict detection mechanism of HTM for lightweight data race detection at runtime. However, the limitations of the current best-effort commodity HTMs expose several challenges in using them for data race detection: (1) lack of ability to pinpoint racy instructions, (2) false positives caused by cache line granularity of conflict detection, and (3) transactional aborts for non-conflict reasons (e.g., capacity or unknown). 
To overcome these challenges, TxRace performs lightweight HTM-based data race detection at first, and occasionally switches to slow yet precise data race detection only for the small fraction of execution intervals in which potential races are reported by HTM. According to the experimental results, TxRace reduces the average runtime overhead of dynamic data race detection from 11.68x to 4.65x with only a small number of false negatives.", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J597", } @Article{Areias:2017:SDP, author = "Miguel Areias and Ricardo Rocha", title = "On scaling dynamic programming problems with a multithreaded tabling {Prolog} system", journal = j-J-SYST-SOFTW, volume = "125", number = "??", pages = "417--426", month = mar, year = "2017", CODEN = "JSSODM", ISSN = "0164-1212 (print), 1873-1228 (electronic)", ISSN-L = "0164-1212", bibdate = "Sat Feb 4 12:20:39 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/jsystsoftw.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.sciencedirect.com/science/article/pii/S0164121216300929", acknowledgement = ack-nhfb, fjournal = "Journal of Systems and Software", journal-URL = "http://www.sciencedirect.com/science/journal/01641212/", } @Article{Arteaga:2017:GFG, author = "Jaime Arteaga and St{\'e}phane Zuckerman and Guang R.
Gao", title = "Generating Fine-Grain Multithreaded Applications Using a Multigrain Approach", journal = j-TACO, volume = "14", number = "4", pages = "47:1--47:??", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3155288", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Dec 22 18:25:55 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The recent evolution in hardware landscape, aimed at producing high-performance computing systems capable of reaching extreme-scale performance, has reignited the interest in fine-grain multithreading, particularly at the intranode level. Indeed, popular parallel programming environments, such as OpenMP, which features a simple interface for the parallelization of programs, are now incorporating fine-grain constructs. However, since coarse-grain directives are still heavily used, the OpenMP runtime is forced to support both coarse- and fine-grain models of execution, potentially reducing the advantages obtained when executing an application in a fully fine-grain environment. To evaluate the type of applications that benefit from executing in a unified fine-grain program execution model, this article presents a multigrain parallel programming environment for the generation of fine-grain multithreaded applications from programs featuring OpenMP's API, allowing OpenMP programs to be run on top of a fine-grain event-driven program execution model. 
Experimental results with five scientific benchmarks show that fine-grain applications, generated by and run on our environment with two runtimes implementing a fine-grain event-driven program execution model, are competitive and can outperform their OpenMP counterparts, especially for data-intensive workloads with irregular and dynamic parallelism, reaching speedups as high as 2.6$ \times $ for Graph500 and 51$ \times $ for NAS Data Cube.", acknowledgement = ack-nhfb, articleno = "47", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", } @Article{Bender:2017:TLM, author = "Michael A. Bender and Jonathan W. Berry and Simon D. Hammond and K. Scott Hemmert and Samuel McCauley and Branden Moore and Benjamin Moseley and Cynthia A. Phillips and David Resnick and Arun Rodrigues", title = "Two-level main memory co-design: Multi-threaded algorithmic primitives, analysis, and simulation", journal = j-J-PAR-DIST-COMP, volume = "102", number = "??", pages = "213--228", month = apr, year = "2017", CODEN = "JPDCER", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Wed Jan 25 14:20:18 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.sciencedirect.com/science/article/pii/S074373151630185X", acknowledgement = ack-nhfb, fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315/", } @Book{Blandy:2017:PR, author = "Jim Blandy and Jason Orendorff", title = "Programming {Rust}", publisher = pub-ORA-MEDIA, address = pub-ORA-MEDIA:adr, pages = "xx + 598", year = "2017", ISBN = "1-4919-2728-3 (paperback), 1-4919-2727-5, 1-4919-2723-2 (e-book), 1-4919-2725-9 (e-book)", ISBN-13 = "978-1-4919-2728-1 (paperback), 978-1-4919-2727-4, 978-1-4919-2723-6 (e-book), 978-1-4919-2725-0 (e-book)", LCCN = 
"QA76.73.R88 B53 2017", bibdate = "Mon Dec 9 15:37:10 MST 2019", bibsource = "fsz3950.oclc.org:210/WorldCat; https://www.math.utah.edu/pub/tex/bib/master.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/ora.bib", URL = "http://proquest.safaribooksonline.com/9781491927274", abstract = "Rust is a new systems programming language that combines the performance and low-level control of C and C++ with memory safety and thread safety. Rust's modern, flexible types ensure your program is free of null pointer dereferences, double frees, dangling pointers, and similar bugs, all at compile time, without runtime overhead. In multithreaded code, Rust catches data races at compile time, making concurrency much easier to use. Written by two experienced systems programmers, this book explains how Rust manages to bridge the gap between performance and safety, and how you can take advantage of it. Topics include: How Rust represents values in memory (with diagrams) Complete explanations of ownership, moves, borrows, and lifetimes Cargo, rustdoc, unit tests, and how to publish your code on crates.io, Rust's public package repository High-level features like generic code, closures, collections, and iterators that make Rust productive and flexible Concurrency in Rust: threads, mutexes, channels, and atomics, all much safer to use than in C or C++ Unsafe code, and how to preserve the integrity of ordinary code that uses it. 
Extended examples illustrating how pieces of the language fit together.", acknowledgement = ack-nhfb, libnote = "Not in my library.", subject = "UNIX (Computer file); UNIX (Computer file); C (Computer program language); Text editors (Computer programs); Software engineering; C (Computer program language); Software engineering.; Text editors (Computer programs)", tableofcontents = "Preface \\ Who Should Read This Book \\ Why We Wrote This Book \\ Navigating This Book \\ Conventions Used in This Book \\ Using Code Examples \\ O'Reilly Safari \\ How to Contact Us \\ Acknowledgments \\ 1. Why Rust? \\ Type Safety \\ 2. A Tour of Rust \\ Downloading and Installing Rust \\ A Simple Function \\ Writing and Running Unit Tests \\ Handling Command-Line Arguments \\ A Simple Web Server \\ Concurrency \\ What the Mandelbrot Set Actually Is \\ Parsing Pair Command-Line Arguments \\ Mapping from Pixels to Complex Numbers \\ Plotting the Set \\ Writing Image Files \\ A Concurrent Mandelbrot Program \\ Running the Mandelbrot Plotter \\ Safety Is Invisible \\ 3. Basic Types \\ Machine Types \\ Integer Types \\ Floating-Point Types \\ The bool Type \\ Characters \\ Tuples \\ Pointer Types \\ References \\ Boxes \\ Raw Pointers \\ Arrays, Vectors, and Slices \\ Arrays \\ Vectors \\ Slices \\ String Types \\ String Literals \\ Byte Strings \\ Strings in Memory \\ String \\ Using Strings \\ Other String-Like Types \\ Beyond the Basics \\ 4. Ownership \\ Ownership \\ Moves \\ More Operations That Move \\ Moves and Control Flow \\ Moves and Indexed Content \\ Copy Types: The Exception to Moves \\ Rc and Arc: Shared Ownership \\ 5.
References \\ References as Values \\ Rust References Versus C++ References \\ Assigning References \\ References to References \\ Comparing References \\ References Are Never Null \\ Borrowing References to Arbitrary Expressions \\ References to Slices and Trait Objects \\ Reference Safety \\ Borrowing a Local Variable \\ Receiving References as Parameters \\ Passing References as Arguments \\ Returning References \\ Structs Containing References \\ Distinct Lifetime Parameters \\ Omitting Lifetime Parameters \\ Sharing Versus Mutation \\ Taking Arms Against a Sea of Objects \\ 6. Expressions \\ An Expression Language \\ Blocks and Semicolons \\ Declarations \\ if and match \\ if let \\ Loops \\ return Expressions \\ Why Rust Has loop \\ Function and Method Calls \\ Fields and Elements \\ Reference Operators \\ Arithmetic, Bitwise, Comparison, and Logical Operators \\ Assignment \\ Type Casts \\ Closures \\ Precedence and Associativity \\ Onward \\ 7. Error Handling \\ Panic \\ Unwinding \\ Aborting \\ Result \\ Catching Errors \\ Result Type Aliases \\ Printing Errors \\ Propagating Errors \\ Working with Multiple Error Types \\ Dealing with Errors That Can't Happen \\ Ignoring Errors \\ Handling Errors in main() \\ Declaring a Custom Error Type \\ Why Results? \\ 8. Crates and Modules \\ Crates \\ Build Profiles \\ Modules \\ Modules in Separate Files \\ Paths and Imports \\ The Standard Prelude \\ Items, the Building Blocks of Rust \\ Turning a Program into a Library \\ The src/bin Directory \\ Attributes \\ Tests and Documentation \\ Integration Tests \\ Documentation \\ Doc-Tests \\ Specifying Dependencies \\ Versions \\ Cargo.lock \\ Publishing Crates to crates.io \\ Workspaces \\ More Nice Things \\ 9. Structs \\ Named-Field Structs \\ Tuple-Like Structs \\ Unit-Like Structs \\ Struct Layout \\ Defining Methods with impl \\ Generic Structs \\ Structs with Lifetime Parameters \\ Deriving Common Traits for Struct Types \\ Interior Mutability \\ 10.
Enums and Patterns \\ Enums \\ Enums with Data \\ Enums in Memory \\ Rich Data Structures Using Enums \\ Generic Enums \\ Patterns \\ Literals, Variables, and Wildcards in Patterns \\ Tuple and Struct Patterns \\ Reference Patterns \\ Matching Multiple Possibilities \\ Pattern Guards \\ @ patterns \\ Where Patterns Are Allowed \\ Populating a Binary Tree \\ The Big Picture \\ 11. Traits and Generics \\ Using Traits \\ Trait Objects \\ Trait Object Layout \\ Generic Functions \\ Which to Use \\ Defining and Implementing Traits \\ Default Methods \\ Traits and Other People's Types \\ Self in Traits \\ Subtraits \\ Static Methods \\ Fully Qualified Method Calls \\ Traits That Define Relationships Between Types \\ Associated Types (or How Iterators Work) \\ Generic Traits (or How Operator Overloading Works) \\ Buddy Traits (or How rand::random() Works) \\ Reverse-Engineering Bounds \\ Conclusion \\ 12. Operator Overloading \\ Arithmetic and Bitwise Operators \\ Unary Operators \\ Binary Operators \\ Compound Assignment Operators \\ Equality Tests \\ Ordered Comparisons \\ Index and IndexMut \\ Other Operators \\ 13. Utility Traits \\ Drop \\ Sized \\ Clone \\ Copy \\ Deref and DerefMut \\ Default \\ AsRef and AsMut \\ Borrow and BorrowMut \\ From and Into \\ ToOwned \\ Borrow and ToOwned at Work: The Humble Cow \\ 14. Closures \\ Capturing Variables \\ Closures That Borrow \\ Closures That Steal \\ Function and Closure Types \\ Closure Performance \\ Closures and Safety \\ Closures That Kill \\ FnOnce \\ FnMut \\ Callbacks \\ Using Closures Effectively \\ 15.
Iterators \\ The Iterator and IntoIterator Traits \\ Creating Iterators \\ iter and iter_mut Methods \\ IntoIterator Implementations \\ drain Methods \\ Other Iterator Sources \\ Iterator Adapters \\ map and filter \\ filter_map and flat_map \\ scan \\ take and take_while \\ skip and skip_while \\ peekable \\ fuse \\ Reversible Iterators and rev \\ inspect \\ chain \\ enumerate \\ zip \\ by_ref \\ cloned \\ cycle \\ Consuming Iterators \\ Simple Accumulation: count, sum, product \\ max, min \\ max_by, min_by \\ max_by_key, min_by_key \\ Comparing Item Sequences \\ any and all \\ position, rposition, and ExactSizeIterator \\ fold \\ nth \\ last \\ find \\ Building Collections: collect and FromIterator \\ The Extend Trait \\ partition \\ Implementing Your Own Iterators \\ 16. Collections \\ Overview \\ Vec \\ Accessing Elements \\ Iteration \\ Growing and Shrinking Vectors \\ Joining \\ Splitting \\ Swapping \\ Sorting and Searching \\ Comparing Slices \\ Random Elements \\ Rust Rules Out Invalidation Errors \\ VecDeque \\ LinkedList \\ BinaryHeap \\ HashMap and BTreeMap \\ Entries \\ Map Iteration \\ HashSet and BTreeSet \\ Set Iteration \\ When Equal Values Are Different \\ Whole-Set Operations \\ Hashing \\ Using a Custom Hashing Algorithm \\ Beyond the Standard Collections \\ 17. 
Strings and Text \\ Some Unicode Background \\ ASCII, Latin-1, and Unicode \\ UTF-8 \\ Text Directionality \\ Characters (char) \\ Classifying Characters \\ Handling Digits \\ Case Conversion for Characters \\ Conversions to and from Integers \\ String and str \\ Creating String Values \\ Simple Inspection \\ Appending and Inserting Text \\ Removing Text \\ Conventions for Searching and Iterating \\ Patterns for Searching Text \\ Searching and Replacing \\ Iterating over Text \\ Trimming \\ Case Conversion for Strings \\ Parsing Other Types from Strings \\ Converting Other Types to Strings \\ Borrowing as Other Text-Like Types \\ Accessing Text as UTF-8 \\ Producing Text from UTF-8 Data \\ Putting Off Allocation \\ Strings as Generic Collections \\ Formatting Values \\ Formatting Text Values \\ Formatting Numbers \\ Formatting Other Types \\ Formatting Values for Debugging \\ Formatting Pointers for Debugging \\ Referring to Arguments by Index or Name \\ Dynamic Widths and Precisions \\ Formatting Your Own Types \\ Using the Formatting Language in Your Own Code \\ Regular Expressions \\ Basic Regex Use \\ Building Regex Values Lazily \\ Normalization \\ Normalization Forms \\ The unicode-normalization Crate \\ 18. Input and Output \\ Readers and Writers \\ Readers \\ Buffered Readers \\ Reading Lines \\ Collecting Lines \\ Writers \\ Files \\ Seeking \\ Other Reader and Writer Types \\ Binary Data, Compression, and Serialization \\ Files and Directories \\ OsStr and Path \\ Path and PathBuf Methods \\ Filesystem Access Functions \\ Reading Directories \\ Platform-Specific Features \\ Networking \\ 19. 
Concurrency \\ Fork-Join Parallelism \\ spawn and join \\ Error Handling Across Threads \\ Sharing Immutable Data Across Threads \\ Rayon \\ Revisiting the Mandelbrot Set \\ Channels \\ Sending Values \\ Receiving Values \\ Running the Pipeline \\ Channel Features and Performance \\ Thread Safety: Send and Sync \\ Piping Almost Any Iterator to a Channel \\ Beyond Pipelines \\ Shared Mutable State \\ What Is a Mutex? \\ Mutex \\ mut and Mutex \\ Why Mutexes Are Not Always a Good Idea \\ Deadlock \\ Poisoned Mutexes \\ Multi-Consumer Channels Using Mutexes \\ Read/Write Locks (RwLock) \\ Condition Variables (Condvar) \\ Atomics \\ Global Variables \\ What Hacking Concurrent Code in Rust Is Like \\ 20. Macros \\ Macro Basics \\ Basics of Macro Expansion \\ Unintended Consequences \\ Repetition \\ Built-In Macros \\ Debugging Macros \\ The json! Macro \\ Fragment Types \\ Recursion in Macros \\ Using Traits with Macros \\ Scoping and Hygiene \\ Importing and Exporting Macros \\ Avoiding Syntax Errors During Matching \\ Beyond macro_rules! \\ 21. Unsafe Code \\ Unsafe from What? \\ Unsafe Blocks \\ Example: An Efficient ASCII String Type \\ Unsafe Functions \\ Unsafe Block or Unsafe Function? 
\\ Undefined Behavior \\ Unsafe Traits \\ Raw Pointers \\ Dereferencing Raw Pointers Safely \\ Example: RefWithFlag \\ Nullable Pointers \\ Type Sizes and Alignments \\ Pointer Arithmetic \\ Moving into and out of Memory \\ Example: GapBuffer \\ Panic Safety in Unsafe Code \\ Foreign Functions: Calling C and C++ from Rust \\ Finding Common Data Representations \\ Declaring Foreign Functions and Variables \\ Using Functions from Libraries \\ A Raw Interface to libgit2 \\ A Safe Interface to libgit2 \\ Conclusion \\ Index", } @Article{Bujanovic:2017:HBA, author = "Zvonimir Bujanovi{\'c} and Lars Karlsson and Daniel Kressner", title = "A {Householder}-based algorithm for {Hessenberg}-triangular reduction", journal = "arxiv.org", volume = "??", number = "??", pages = "??--??", day = "23", month = oct, year = "2017", bibdate = "Fri Dec 21 10:00:58 2018", bibsource = "https://www.math.utah.edu/pub/bibnet/authors/h/householder-alston-s.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "https://arxiv.org/abs/1710.08538", abstract = "The QZ algorithm for computing eigenvalues and eigenvectors of a matrix pencil {$A - \lambda B$} requires that the matrices first be reduced to Hessenberg-triangular (HT) form. The current method of choice for HT reduction relies entirely on Givens rotations regrouped and accumulated into small dense matrices which are subsequently applied using matrix multiplication routines. A non-vanishing fraction of the total flop count must nevertheless still be performed as sequences of overlapping Givens rotations alternately applied from the left and from the right. The many data dependencies associated with this computational pattern leads to inefficient use of the processor and poor scalability. In this paper, we therefore introduce a fundamentally different approach that relies entirely on (large) Householder reflectors partially accumulated into block reflectors, by using (compact) WY representations.
Even though the new algorithm requires more floating point operations than the state of the art algorithm, extensive experiments on both real and synthetic data indicate that it is still competitive, even in a sequential setting. The new algorithm is conjectured to have better parallel scalability, an idea which is partially supported by early small-scale experiments using multi-threaded BLAS. The design and evaluation of a parallel formulation is future work.", acknowledgement = ack-nhfb, } @Article{Cao:2017:HRD, author = "Man Cao and Minjia Zhang and Aritra Sengupta and Swarnendu Biswas and Michael D. Bond", title = "Hybridizing and Relaxing Dependence Tracking for Efficient Parallel Runtime Support", journal = j-TOPC, volume = "4", number = "2", pages = "9:1--9:??", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3108138", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Tue Oct 10 17:42:07 MDT 2017", bibsource = "http://topc.acm.org/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "It is notoriously challenging to develop parallel software systems that are both scalable and correct. Runtime support for parallelism-such as multithreaded record and replay, data race detectors, transactional memory, and enforcement of stronger memory models-helps achieve these goals, but existing commodity solutions slow programs substantially to track (i.e., detect or control) an execution's cross-thread dependencies accurately. Prior work tracks cross-thread dependencies either ``pessimistically,'' slowing every program access, or ``optimistically,'' allowing for lightweight instrumentation of most accesses but dramatically slowing accesses that are conflicting (i.e., involved in cross-thread dependencies). This article presents two novel approaches that seek to improve the performance of dependence tracking. 
Hybrid tracking (HT) hybridizes pessimistic and optimistic tracking by overcoming a fundamental mismatch between these two kinds of tracking. HT uses an adaptive, profile-based policy to make runtime decisions about switching between pessimistic and optimistic tracking. Relaxed tracking (RT) attempts to reduce optimistic tracking's overhead on conflicting accesses by tracking dependencies in a ``relaxed'' way-meaning that not all dependencies are tracked accurately-while still preserving both program semantics and runtime support's correctness. To demonstrate the usefulness and potential of HT and RT, we build runtime support based on the two approaches. Our evaluation shows that both approaches offer performance advantages over existing approaches, but there exist challenges and opportunities for further improvement. HT and RT are distinct solutions to the same problem. It is easier to build runtime support based on HT than on RT, although RT does not incur the overhead of online profiling. This article presents the two approaches together to inform and inspire future designs for efficient parallel runtime support.", acknowledgement = ack-nhfb, articleno = "9", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Catalan:2017:TEM, author = "Sandra Catal{\'a}n and Francisco D. Igual and Rafael Mayo and Rafael Rodr{\'\i}guez-S{\'a}nchez and Enrique S. 
Quintana-Ort{\'\i}", title = "Time and energy modeling of a high-performance multi-threaded {Cholesky} factorization", journal = j-J-SUPERCOMPUTING, volume = "73", number = "1", pages = "139--151", month = jan, year = "2017", CODEN = "JOSUED", DOI = "https://doi.org/10.1007/s11227-016-1654-6", ISSN = "0920-8542 (print), 1573-0484 (electronic)", ISSN-L = "0920-8542", bibdate = "Sat Jun 24 10:31:31 MDT 2017", bibsource = "http://link.springer.com/journal/11227/73/1; https://www.math.utah.edu/pub/tex/bib/jsuper.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "The Journal of Supercomputing", journal-URL = "http://link.springer.com/journal/11227", } @Article{Chen:2017:IGP, author = "Li-Jhan Chen and Hsiang-Yun Cheng and Po-Han Wang and Chia-Lin Yang", title = "Improving {GPGPU} Performance via Cache Locality Aware Thread Block Scheduling", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "2", pages = "127--131", month = jul # "\slash " # dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2017.2693371", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Jun 20 17:18:18 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Modern GPGPUs support the concurrent execution of thousands of threads to provide an energy-efficient platform. However, the massive multi-threading of GPGPUs incurs serious cache contention, as the cache lines brought by one thread can easily be evicted by other threads in the small shared cache. In this paper, we propose a software-hardware cooperative approach that exploits the spatial locality among different thread blocks to better utilize the precious cache capacity. 
Through dynamic locality estimation and thread block scheduling, we can capture more performance improvement opportunities than prior work that only explores the spatial locality between consecutive thread blocks. Evaluations across diverse GPGPU applications show that, on average, our locality-aware scheduler provides 25 and 9 percent performance improvement over the commonly-employed round-robin scheduler and the state-of-the-art scheduler, respectively.", acknowledgement = ack-nhfb, affiliation = "Chen, LJ (Reprint Author), Natl Taiwan Univ, Taipei 10617, Taiwan. Chen, Li-Jhan; Wang, Po-Han; Yang, Chia-Lin, Natl Taiwan Univ, Taipei 10617, Taiwan. Cheng, Hsiang-Yun, Acad Sinica, Taipei 11529, Taiwan.", author-email = "r03922026@csie.ntu.edu.tw hycheng@citi.sinica.edu.tw f96922002@csie.ntu.edu.tw yangc@csie.ntu.edu.tw", da = "2019-06-20", doc-delivery-number = "FR2AX", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Ministry of Science and Technology of Taiwan [MOST-105-2221-E-002-156-MY2, MOST-105-2622-8-002-002, MOST-105-2218-E-002-025]; MediaTek Inc., Hsin-chu, Taiwan", funding-text = "This work is supported in part by research grants from the Ministry of Science and Technology of Taiwan (MOST-105-2221-E-002-156-MY2, MOST-105-2622-8-002-002, and MOST-105-2218-E-002-025), and sponsored by MediaTek Inc., Hsin-chu, Taiwan.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "cache locality; GPGPU; thread block scheduling", number-of-cited-references = "18", research-areas = "Computer Science", times-cited = "0", unique-id = "Chen:2017:IGP", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Cui:2017:MTA, author = "Huanqing Cui and Jian Niu and Chuanai Zhou and Minglei Shu", title = "A Multi-Threading Algorithm to Detect and Remove Cycles in Vertex- and Arc-Weighted Digraph", journal = j-ALGORITHMS-BASEL, volume = "10", number = "4", month = dec, year = "2017", CODEN = "ALGOCH", DOI = "https://doi.org/10.3390/a10040115", ISSN = "1999-4893 (electronic)", ISSN-L = "1999-4893", bibdate = "Fri May 3 13:50:13 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/algorithms.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "https://www.mdpi.com/1999-4893/10/4/115", acknowledgement = ack-nhfb, articleno = "115", fjournal = "Algorithms (Basel)", journal-URL = "https://www.mdpi.com/journal/algorithms", ORCID-numbers = "Huanqing Cui/0000-0002-9251-680X", pagecount = "??", pubdates = "Received: 28 August 2017 / Revised: 26 September 2017 / Accepted: 9 October 2017 / Published: 10 October 2017", } @Article{Dang:2017:ECB, author = "Hoang-Vu Dang and Marc Snir and William Gropp", title = "Eliminating contention bottlenecks in multithreaded {MPI}", journal = j-PARALLEL-COMPUTING, volume = "69", number = "??", pages = "1--23", month = nov, year = "2017", CODEN = "PACOEJ", ISSN = "0167-8191 (print), 1872-7336 (electronic)", ISSN-L = "0167-8191", bibdate = "Tue Oct 24 15:15:02 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib; https://www.math.utah.edu/pub/tex/bib/pvm.bib", URL = "http://www.sciencedirect.com/science/article/pii/S0167819117301187", acknowledgement = ack-nhfb, fjournal = "Parallel 
Computing", journal-URL = "http://www.sciencedirect.com/science/journal/01678191", } @Article{Dutta:2017:SVC, author = "Sudakshina Dutta and Dipankar Sarkar and Arvind Rawat", title = "Synchronization Validation for Cross-Thread Dependences in Parallel Programs", journal = j-INT-J-PARALLEL-PROG, volume = "45", number = "6", pages = "1326--1365", month = dec, year = "2017", CODEN = "IJPPE5", DOI = "https://doi.org/10.1007/s10766-016-0467-9", ISSN = "0885-7458 (print), 1573-7640 (electronic)", ISSN-L = "0885-7458", bibdate = "Sat Nov 18 09:27:28 MST 2017", bibsource = "http://link.springer.com/journal/10766/45/6; https://www.math.utah.edu/pub/tex/bib/intjparallelprogram.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "International Journal of Parallel Programming", journal-URL = "http://link.springer.com/journal/10766", } @Article{Farzan:2017:SDC, author = "Azadeh Farzan and Victor Nicolet", title = "Synthesis of divide and conquer parallelism for loops", journal = j-SIGPLAN, volume = "52", number = "6", pages = "540--555", month = jun, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140587.3062355", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:17 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Divide-and-conquer is a common parallel programming skeleton supported by many cross-platform multithreaded libraries, and most commonly used by programmers for parallelization. 
The challenges of producing (manually or automatically) a correct divide-and-conquer parallel program from a given sequential code are two-fold: (1) assuming that a good solution exists where individual worker threads execute a code identical to the sequential one, the programmer has to provide the extra code for dividing the tasks and combining the partial results (i.e. joins), and (2) the sequential code may not be suitable for divide-and-conquer parallelization as is, and may need to be modified to become a part of a good solution. We address both challenges in this paper. We present an automated synthesis technique to synthesize correct joins and an algorithm for modifying the sequential code to make it suitable for parallelization when necessary. This paper focuses on a class of loops that traverse a read-only collection and compute a scalar function over that collection. We present theoretical results for when the necessary modifications to sequential code are possible, theoretical guarantees for the algorithmic solutions presented here, and experimental evaluation of the approach's success in practice and the quality of the produced parallel programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "PLDI '17 conference proceedings.", } @Article{Feliu:2017:PFP, author = "J. Feliu and J. Sahuquillo and S. Petit and J. 
Duato", title = "{Perf Fair}: A Progress-Aware Scheduler to Enhance Performance and Fairness in {SMT} Multicores", journal = j-IEEE-TRANS-COMPUT, volume = "66", number = "5", pages = "905--911", month = may, year = "2017", CODEN = "ITCOB4", DOI = "https://doi.org/10.1109/TC.2016.2620977", ISSN = "0018-9340 (print), 1557-9956 (electronic)", ISSN-L = "0018-9340", bibdate = "Thu Apr 6 07:46:06 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Computers", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12", keywords = "Bandwidth; Estimation; fairness; Interference; multicore; Multicore processing; performance estimation; Processor scheduling; Program processors; Resource management; Scheduling; SMT", } @Article{Gasiunas:2017:FBA, author = "Vaidas Gasiunas and David Dominguez-Sal and Ralph Acker and Aharon Avitzur and Ilan Bronshtein and Rushan Chen and Eli Ginot and Norbert Martinez-Bazan and Michael M{\"u}ller and Alexander Nozdrin and Weijie Ou and Nir Pachter and Dima Sivov and Eliezer Levy", title = "Fiber-based architecture for {NFV} cloud databases", journal = j-PROC-VLDB-ENDOWMENT, volume = "10", number = "12", pages = "1682--1693", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.14778/3137765.3137774", ISSN = "2150-8097", bibdate = "Tue Oct 10 17:16:19 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "The telco industry is gradually shifting from using monolithic software packages deployed on custom hardware to using modular virtualized software functions deployed on cloudified data centers using commodity hardware. This transformation is referred to as Network Function Virtualization (NFV). 
The scalability of the databases (DBs) underlying the virtual network functions is the cornerstone for reaping the benefits from the NFV transformation. This paper presents an industrial experience of applying shared-nothing techniques in order to achieve the scalability of a DB in an NFV setup. The special combination of requirements in NFV DBs are not easily met with conventional execution models. Therefore, we designed a special shared-nothing architecture that is based on cooperative multi-tasking using user-level threads (fibers). We further show that the fiber-based approach outperforms the approach built using conventional multi-threading and meets the variable deployment needs of the NFV transformation. Furthermore, fibers yield a simpler-to-maintain software and enable controlling a trade-off between long-duration computations and real-time requests.", acknowledgement = ack-nhfb, fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Georgakoudis:2017:SSA, author = "Giorgis Georgakoudis and Hans Vandierendonck and Peter Thoman and Bronis R. {De Supinski} and Thomas Fahringer and Dimitrios S. Nikolopoulos", title = "{SCALO}: Scalability-Aware Parallelism Orchestration for Multi-Threaded Workloads", journal = j-TACO, volume = "14", number = "4", pages = "54:1--54:??", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3158643", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Dec 22 18:25:55 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Shared memory machines continue to increase in scale by adding more parallelism through additional cores and complex memory hierarchies. 
Often, executing multiple applications concurrently, dividing among them hardware threads, provides greater efficiency rather than executing a single application with large thread counts. However, contention for shared resources can limit the improvement of concurrent application execution: orchestrating the number of threads used by each application is essential. In this article, we contribute SCALO, a solution to orchestrate concurrent application execution to increase throughput. SCALO monitors co-executing applications at runtime to evaluate their scalability. Its optimizing thread allocator analyzes these scalability estimates to adapt the parallelism of each program. Unlike previous approaches, SCALO differs by including dynamic contention effects on scalability and by controlling the parallelism during the execution of parallel regions. Thus, it improves throughput when other state-of-the-art approaches fail and outperforms them by up to 40\% when they succeed.", acknowledgement = ack-nhfb, articleno = "54", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", } @Article{Georgiou:2017:ETD, author = "Kyriakos Georgiou and Steve Kerrison and Zbigniew Chamski and Kerstin Eder", title = "Energy Transparency for Deeply Embedded Programs", journal = j-TACO, volume = "14", number = "1", pages = "8:1--8:??", month = apr, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3046679", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jul 24 18:00:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Energy transparency is a concept that makes a program's energy consumption visible, from hardware up to software, through the different system layers. 
Such transparency can enable energy optimizations at each layer and between layers, as well as help both programmers and operating systems make energy-aware decisions. In this article, we focus on deeply embedded devices, typically used for Internet of Things (IoT) applications, and demonstrate how to enable energy transparency through existing static resource analysis (SRA) techniques and a new target-agnostic profiling technique, without hardware energy measurements. Our novel mapping technique enables software energy consumption estimations at a higher level than the Instruction Set Architecture (ISA), namely the LLVM intermediate representation (IR) level, and therefore introduces energy transparency directly to the LLVM optimizer. We apply our energy estimation techniques to a comprehensive set of benchmarks, including single- and multithreaded embedded programs from two commonly used concurrency patterns: task farms and pipelines. Using SRA, our LLVM IR results demonstrate a high accuracy with a deviation in the range of 1\% from the ISA SRA. Our profiling technique captures the actual energy consumption at the LLVM IR level with an average error of 3\%.", acknowledgement = ack-nhfb, articleno = "8", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", } @Article{Gupta:2017:DDP, author = "Ujjwal Gupta and Chetan Arvind Patil and Ganapati Bhat and Prabhat Mishra and Umit Y. 
Ogras", title = "{DyPO}: Dynamic {Pareto}-Optimal Configuration Selection for Heterogeneous {MpSoCs}", journal = j-TECS, volume = "16", number = "5s", pages = "123:1--123:??", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3126530", ISSN = "1539-9087 (print), 1558-3465 (electronic)", ISSN-L = "1539-9087", bibdate = "Thu Oct 17 18:16:33 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/tecs.bib", abstract = "Modern multiprocessor systems-on-chip (MpSoCs) offer tremendous power and performance optimization opportunities by tuning thousands of potential voltage, frequency and core configurations. As the workload phases change at runtime, different configurations may become optimal with respect to power, performance or other metrics. Identifying the optimal configuration at runtime is infeasible due to the large number of workloads and configurations. This paper proposes a novel methodology that can find the Pareto-optimal configurations at runtime as a function of the workload. To achieve this, we perform an extensive offline characterization to find classifiers that map performance counters to optimal configurations. Then, we use these classifiers and performance counters at runtime to choose Pareto-optimal configurations. We evaluate the proposed methodology by maximizing the performance per watt for 18 single- and multi-threaded applications. 
Our experiments demonstrate an average increase of 93\%, 81\% and 6\% in performance per watt compared to the interactive, on demand and powersave governors, respectively.", acknowledgement = ack-nhfb, articleno = "123", fjournal = "ACM Transactions on Embedded Computing Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J840", } @Article{Hankendi:2017:SCS, author = "Can Hankendi and Ayse Kivilcim Coskun", title = "Scale \& Cap: Scaling-Aware Resource Management for Consolidated Multi-threaded Applications", journal = j-TODAES, volume = "22", number = "2", pages = "30:1--30:??", month = mar, year = "2017", CODEN = "ATASFO", DOI = "https://doi.org/10.1145/2994145", ISSN = "1084-4309 (print), 1557-7309 (electronic)", ISSN-L = "1084-4309", bibdate = "Fri Jul 21 10:49:30 MDT 2017", bibsource = "http://www.acm.org/pubs/contents/journals/todaes/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/todaes.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "As the number of cores per server node increases, designing multi-threaded applications has become essential to efficiently utilize the available hardware parallelism. Many application domains have started to adopt multi-threaded programming; thus, efficient management of multi-threaded applications has become a significant research problem. Efficient execution of multi-threaded workloads on cloud environments, where applications are often consolidated by means of virtualization, relies on understanding the multi-threaded specific characteristics of the applications. Furthermore, energy cost and power delivery limitations require data center server nodes to work under power caps, which bring additional challenges to runtime management of consolidated multi-threaded applications. This article proposes a dynamic resource allocation technique for consolidated multi-threaded applications for power-constrained environments. 
Our technique takes into account application characteristics specific to multi-threaded applications, such as power and performance scaling, to make resource distribution decisions at runtime to improve the overall performance, while accurately tracking dynamic power caps. We implement and evaluate our technique on state-of-the-art servers and show that the proposed technique improves the application performance by up to 21\% under power caps compared to a default resource manager.", acknowledgement = ack-nhfb, articleno = "30", fjournal = "ACM Transactions on Design Automation of Electronic Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J776", } @Article{Hroub:2017:EGC, author = "Ayman Hroub and M. E. S. Elrabaa and M. F. Mudawar and A. Khayyat", title = "Efficient Generation of Compact Execution Traces for Multicore Architectural Simulations", journal = j-TACO, volume = "14", number = "3", pages = "27:1--27:??", month = sep, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3106342", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Sep 6 17:12:05 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Requiring no functional simulation, trace-driven simulation has the potential of achieving faster simulation speeds than execution-driven simulation of multicore architectures. An efficient, on-the-fly, high-fidelity trace generation method for multithreaded applications is reported. The generated trace is encoded in an instruction-like binary format that can be directly ``interpreted'' by a timing simulator to simulate a general load/store or x86-like architecture. 
A complete tool suite that has been developed and used for evaluation of the proposed method showed that it produces smaller traces over existing trace compression methods while retaining good fidelity including all threading- and synchronization-related events.", acknowledgement = ack-nhfb, articleno = "27", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", } @Article{Jung:2017:LSD, author = "Sungbo Jung and Dar-Jen Chang and Juw Won Park", title = "Large scale document inversion using a multi-threaded computing system", journal = j-SIGAPP, volume = "17", number = "2", pages = "27--35", month = aug, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3131080.3131083", ISSN = "1559-6915 (print), 1931-0161 (electronic)", ISSN-L = "1559-6915", bibdate = "Thu Jan 23 10:25:03 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigapp.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3131080.3131083", abstract = "Current microprocessor architecture is moving towards multi-core/multi-threaded systems. This trend has led to a surge of interest in using multi-threaded computing devices, such as the Graphics Processing Unit (GPU), for general purpose computing. 
We \ldots{}", acknowledgement = ack-nhfb, fjournal = "ACM SIGAPP Applied Computing Review", journal-URL = "https://dl.acm.org/loi/sigapp", } @Book{Klabnik:2017:RPL, author = "Steve Klabnik and Carol Nichols", title = "The {Rust} Programming Language", publisher = pub-NO-STARCH, address = pub-NO-STARCH:adr, pages = "xxvii + 519", year = "2017", ISBN = "1-59327-828-4 (paperback), 1-59327-851-9 (e-pub)", ISBN-13 = "978-1-59327-828-1 (paperback), 978-1-59327-851-9 (e-pub)", LCCN = "QA76.73.R87 K53 2018", bibdate = "Thu Oct 31 18:42:15 MDT 2019", bibsource = "fsz3950.oclc.org:210/WorldCat; https://www.math.utah.edu/pub/tex/bib/master.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "\booktitle{The Rust Programming Language} is the official book on Rust; a community-developed, systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety. Rust's memory safety guarantees, enforced at compile time, safeguard your programs against the many problems that pervade other systems languages. Rust offers the control and performance of a low-level language with the helpful abstractions of a high level one, and does this all without having a garbage collector. These characteristics make Rust useful for embedding in other languages, programs with specific space and time requirements, and writing low-level code, like device drivers and operating systems. \booktitle{The Rust Programming Language} begins with a quick hands-on project to introduce the basics, then explores key concepts in depth, such as ownership, the type system, error handling, and fearless concurrency. Detailed explanations of Rust-oriented takes on topics like pattern matching, iterators, and smart pointers combine with examples and exercises to take you from theory to practice. 
In addition to its thorough coverage of more granular topics, \booktitle{The Rust Programming Language} will show you how to: * Grasp important concepts unique to Rust like ownership, borrowing, and lifetimes; * Use Cargo, Rust's built-in package manager, to build your code, including downloading and building dependencies; * Effectively use Rust's zero-cost abstractions and learn to build your own. Developed with help from the community, \booktitle{The Rust Programming Language} is your official guide to becoming a productive Rust programmer. The official guide to Rust, a community-developed, systems programming language. Begins with a hands-on project to introduce the basics, then explores key concepts in depth''", acknowledgement = ack-nhfb, libnote = "Not in my library.", subject = "Computer programming; Programming languages (Electronic computers); Computer programming.; Programming languages (Electronic computers)", tableofcontents = "Foreword / by Nicholas Matsakis and Aaron Turon \\ Introduction \\ 1: Getting Started \\ 2: A Quick Tutorial \\ Guessing Game \\ 3: Common Programming Concepts \\ 4: Understanding Ownership \\ 5: Structs \\ 6: Enums and Pattern Matching \\ 7: Modules \\ 8: Common Collections \\ 9: Error Handling \\ 10: Generic Types, Traits, and Lifetimes \\ 11: Testing \\ 12: An Input\slash Output Project \\ 13: Functional Language Features in Rust \\ Iterators and Closures \\ 14: More about Cargo and Crates io \\ 15: Smart Pointers \\ 16: Concurrency \\ 17: Is Rust Object Oriented? 
\\ 18: Patterns \\ 19: More About Lifetimes \\ 20: Advanced Type System Features \\ Appendix A: Keywords \\ Appendix B: Operators \\ Appendix C: Derivable Traits \\ Appendix D: Nightly Rust\ \\ Nightly Rust \\ Glossary", } @Article{Kleinmann:2017:ACS, author = "Amit Kleinmann and Avishai Wool", title = "Automatic Construction of Statechart-Based Anomaly Detection Models for Multi-Threaded Industrial Control Systems", journal = j-TIST, volume = "8", number = "4", pages = "55:1--55:??", month = jul, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3011018", ISSN = "2157-6904 (print), 2157-6912 (electronic)", ISSN-L = "2157-6904", bibdate = "Sat Dec 23 10:12:41 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/tist.bib", abstract = "Traffic of Industrial Control System (ICS) between the Human Machine Interface (HMI) and the Programmable Logic Controller (PLC) is known to be highly periodic. However, it is sometimes multiplexed, due to asynchronous scheduling. Modeling the network traffic patterns of multiplexed ICS streams using Deterministic Finite Automata (DFA) for anomaly detection typically produces a very large DFA and a high false-alarm rate. In this article, we introduce a new modeling approach that addresses this gap. Our Statechart DFA modeling includes multiple DFAs, one per cyclic pattern, together with a DFA-selector that de-multiplexes the incoming traffic into sub-channels and sends them to their respective DFAs. We demonstrate how to automatically construct the statechart from a captured traffic stream. Our unsupervised learning algorithms first build a Discrete-Time Markov Chain (DTMC) from the stream. Next, we split the symbols into sets, one per multiplexed cycle, based on symbol frequencies and node degrees in the DTMC graph. Then, we create a sub-graph for each cycle and extract Euler cycles for each sub-graph. 
The final statechart is comprised of one DFA per Euler cycle. The algorithms allow for non-unique symbols, which appear in more than one cycle, and also for symbols that appear more than once in a cycle. We evaluated our solution on traces from a production ICS using the Siemens S7-0x72 protocol. We also stress-tested our algorithms on a collection of synthetically-generated traces that simulated multiplexed ICS traces with varying levels of symbol uniqueness and time overlap. The algorithms were able to split the symbols into sets with 99.6\% accuracy. The resulting statechart modeled the traces with a median false-alarm rate of as low as 0.483\%. In all but the most extreme scenarios, the Statechart model drastically reduced both the false-alarm rate and the learned model size in comparison with the naive single-DFA model.", acknowledgement = ack-nhfb, articleno = "55", fjournal = "ACM Transactions on Intelligent Systems and Technology (TIST)", journal-URL = "http://portal.acm.org/citation.cfm?id=J1318", } @Article{Kojima:2017:HLG, author = "Kensuke Kojima and Atsushi Igarashi", title = "A {Hoare} Logic for {GPU} Kernels", journal = j-TOCL, volume = "18", number = "1", pages = "3:1--3:??", month = apr, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3001834", ISSN = "1529-3785 (print), 1557-945X (electronic)", ISSN-L = "1529-3785", bibdate = "Thu Apr 13 17:53:54 MDT 2017", bibsource = "http://www.acm.org/pubs/contents/journals/tocl/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/tocl.bib", abstract = "We study a Hoare Logic to reason about parallel programs executed on graphics processing units (GPUs), called GPU kernels. During the execution of GPU kernels, multiple threads execute in lockstep, that is, execute the same instruction simultaneously. 
When the control branches, the two branches are executed sequentially, but during the execution of each branch only those threads that take it are enabled; after the control converges, all the threads are enabled and again execute in lockstep. In this article, we first consider a semantics in which all threads execute in lockstep (this semantics simplifies the actual execution model of GPUs) and adapt Hoare Logic to this setting by augmenting the usual Hoare triples with an additional component representing the set of enabled threads. It is determined that the soundness and relative completeness of the logic do not hold for all programs; a difficulty arises from the fact that one thread can invalidate the loop termination condition of another thread through shared memory. We overcome this difficulty by identifying an appropriate class of programs for which the soundness and relative completeness hold. Additionally, we discuss thread interleaving, which is present in the actual execution of GPUs but not in the lockstep semantics mentioned above. We show that if a program is race free, then the lockstep and interleaving semantics produce the same result. 
This implies that our logic is sound and relatively complete for race-free programs, even if the thread interleaving is taken into account.", acknowledgement = ack-nhfb, articleno = "3", fjournal = "ACM Transactions on Computational Logic", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J773", } @Article{Komosinski:2017:MCE, author = "Maciej Komosinski and Szymon Ulatowski", title = "Multithreaded computing in evolutionary design and in artificial life simulations", journal = j-J-SUPERCOMPUTING, volume = "73", number = "5", pages = "2214--2228", month = may, year = "2017", CODEN = "JOSUED", DOI = "https://doi.org/10.1007/s11227-016-1923-4", ISSN = "0920-8542 (print), 1573-0484 (electronic)", ISSN-L = "0920-8542", bibdate = "Sat Jun 24 10:31:33 MDT 2017", bibsource = "http://link.springer.com/journal/11227/73/5; https://www.math.utah.edu/pub/tex/bib/jsuper.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer.com/content/pdf/10.1007/s11227-016-1923-4.pdf", acknowledgement = ack-nhfb, fjournal = "The Journal of Supercomputing", journal-URL = "http://link.springer.com/journal/11227", } @Article{Kopczynski:2017:LSS, author = "Eryk Kopczy{\'n}ski and Szymon Toru{\'n}czyk", title = "{LOIS}: syntax and semantics", journal = j-SIGPLAN, volume = "52", number = "1", pages = "586--598", month = jan, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093333.3009876", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "We present the semantics of an imperative programming language called LOIS (Looping Over Infinite Sets), which allows iterating through certain infinite sets, in finite time. Our semantics intuitively correspond to execution of infinitely many threads in parallel. 
This allows to merge the power of abstract mathematical constructions into imperative programming. Infinite sets are internally represented using first order formulas over some underlying logical structure, and SMT solvers are employed to evaluate programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "POPL '17 conference proceedings.", } @Article{Lee:2017:MVN, author = "Doowon Lee and Valeria Bertacco", title = "{MTraceCheck}: Validating Non-Deterministic Behavior of Memory Consistency Models in Post-Silicon Validation", journal = j-COMP-ARCH-NEWS, volume = "45", number = "2", pages = "201--213", month = may, year = "2017", CODEN = "CANED2", DOI = "https://doi.org/10.1145/3140659.3080235", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Sep 15 11:09:14 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigarch.bib", abstract = "This work presents a minimally-intrusive, high-performance, post-silicon validation framework for validating memory consistency in multi-core systems. Our framework generates constrained-random tests that are instrumented with observability-enhancing code for memory consistency verification. For each test, we generate a set of compact signatures reflecting the memory-ordering patterns observed over many executions of the test, with each of the signatures corresponding to a unique memory-ordering pattern. We then leverage an efficient and novel analysis to quickly determine if the observed execution patterns represented by each unique signature abide by the memory consistency model. Our analysis derives its efficiency by exploiting the structural similarities among the patterns observed. 
We evaluated our framework, MTraceCheck, on two platforms: an x86-based desktop and an ARM-based SoC platform, both running multi-threaded test programs in a bare-metal environment. We show that MTraceCheck reduces the perturbation introduced by the memory-ordering monitoring activity by 93\% on average, compared to a baseline register flushing approach that saves the register's state after each load operation. We also reduce the computation requirements of our consistency checking analysis by 81\% on average, compared to a conventional topological sorting solution. We finally demonstrate the effectiveness of MTraceCheck on buggy designs, by evaluating multiple case studies where it successfully exposes subtle bugs in a full-system simulation environment.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J89", } @Article{Li:2017:EML, author = "Cha V. Li and Vinicius Petrucci and Daniel Moss{\'e}", title = "Exploring Machine Learning for Thread Characterization on Heterogeneous Multiprocessors", journal = j-OPER-SYS-REV, volume = "51", number = "1", pages = "113--123", month = aug, year = "2017", CODEN = "OSRED8", DOI = "https://doi.org/10.1145/3139645.3139664", ISSN = "0163-5980 (print), 1943-586X (electronic)", ISSN-L = "0163-5980", bibdate = "Fri Sep 15 10:37:05 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/opersysrev.bib", abstract = "We introduce a thread characterization method that explores hardware performance counters and machine learning techniques to automate estimating workload execution on heterogeneous processors. We show that our characterization scheme achieves higher accuracy when predicting performance indicators, such as instructions per cycle and last-level cache misses, commonly used to determine the mapping of threads to processor types at runtime. 
We also show that support vector regression achieves higher accuracy when compared to linear regression, and has very low (1\%) overhead. The results presented in this paper can provide a foundation for advanced investigations and interesting new directions in intelligent thread scheduling and power management on multiprocessors.", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J597", } @Article{Li:2017:GGB, author = "Yuxiang Li and Yinliang Zhao and Qiangsheng Wu", title = "{GbA}: a graph-based thread partition approach in speculative multithreading", journal = j-CCPE, volume = "29", number = "21", pages = "??--??", day = "10", month = nov, year = "2017", CODEN = "CCPEBO", DOI = "https://doi.org/10.1002/cpe.4294", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Sat Dec 30 09:11:58 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/ccpe.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Concurrency and Computation: Practice and Experience", journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626", } @Article{Lin:2017:MSP, author = "Zhongwei Lin and Carl Tropper and Robert A. McDougal and Mohammand Nazrul Ishlam Patoary and William W. Lytton and Yiping Yao and Michael L. Hines", title = "Multithreaded Stochastic {PDES} for Reactions and Diffusions in Neurons", journal = j-TOMACS, volume = "27", number = "2", pages = "7:1--7:??", month = jul, year = "2017", CODEN = "ATMCEZ", DOI = "https://doi.org/10.1145/2987373", ISSN = "1049-3301 (print), 1558-1195 (electronic)", ISSN-L = "1049-3301", bibdate = "Tue Jul 11 15:41:32 MDT 2017", bibsource = "http://www.acm.org/pubs/contents/journals/tomacs/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/tomacs.bib", abstract = "Cells exhibit stochastic behavior when the number of molecules is small. 
Hence a stochastic reaction-diffusion simulator capable of working at scale can provide a more accurate view of molecular dynamics within the cell. This article describes a parallel discrete event simulator, Neuron Time Warp-Multi Thread (NTW-MT), developed for the simulation of reaction diffusion models of neurons. To the best of our knowledge, this is the first parallel discrete event simulator oriented toward stochastic simulation of chemical reactions in a neuron. The simulator was developed as part of the NEURON project. NTW-MT is optimistic and thread based, which attempts to capitalize on multicore architectures used in high performance machines. It makes use of a multilevel queue for the pending event set and a single rollback message in place of individual antimessages to disperse contention and decrease the overhead of processing rollbacks. Global Virtual Time is computed asynchronously both within and among processes to get rid of the overhead for synchronizing threads. Memory usage is managed in order to avoid locking and unlocking when allocating and deallocating memory and to maximize cache locality. We verified our simulator on a calcium buffer model. We examined its performance on a calcium wave model, comparing it to the performance of a process based optimistic simulator and a threaded simulator which uses a single priority queue for each thread. Our multithreaded simulator is shown to achieve superior performance to these simulators. 
Finally, we demonstrated the scalability of our simulator on a larger Calcium-Induced Calcium Release (CICR) model and a more detailed CICR model.", acknowledgement = ack-nhfb, articleno = "7", fjournal = "ACM Transactions on Modeling and Computer Simulation", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J781", } @Article{Luo:2017:TDS, author = "Hao Luo and Pengcheng Li and Chen Ding", title = "Thread Data Sharing in Cache: Theory and Measurement", journal = j-SIGPLAN, volume = "52", number = "8", pages = "103--115", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3018759", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "On modern multi-core processors, independent workloads often interfere with each other by competing for shared cache space. However, for multi-threaded workloads, where a single copy of data can be accessed by multiple threads, the threads can cooperatively share cache. Because data sharing consolidates the collective working set of threads, the effective size of shared cache becomes larger than it would have been when data are not shared. This paper presents a new theory of data sharing. It includes (1) a new metric called the shared footprint to mathematically compute the amount of data shared by any group of threads in any size cache, and (2) a linear-time algorithm to measure shared footprint by scanning the memory trace of a multi-threaded program. 
The paper presents the practical implementation and evaluates the new theory using 14 PARSEC and SPEC OMP benchmarks, including an example use of shared footprint in program optimization.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "PPoPP '17 conference proceedings.", } @Article{Marquez:2017:MCH, author = "David Gonzalez Marquez and Adrian Cristal Kestelman and Esteban Mocskos", title = "{Mth}: Codesigned Hardware\slash Software Support for Fine Grain Threads", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "16", number = "1", pages = "64--67", month = jan # "\slash " # jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2016.2606383", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Jun 20 17:18:18 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Multi-core processors are ubiquitous in all market segments from embedded to high performance computing, but only few applications can efficiently utilize them. Existing parallel frameworks aim to support thread-level parallelism in applications, but the imposed overhead prevents their usage for small problem instances. This work presents Micro-threads (Mth) a hardware-software proposal focused on a shared thread management model enabling the use of parallel resources in applications that have small chunks of parallel code or small problem inputs by a combination of software and hardware: delegation of the resource control to the application, an improved mechanism to store and fill processor's context, and an efficient synchronization system. Four sample applications are used to test our proposal: HSL filter (trivially parallel), FFT Radix2 (recursive algorithm), LU decomposition (barrier every cycle) and Dantzig algorithm (graph based, matrix manipulation). 
The results encourage the use of Mth and could smooth the use of multiple cores for applications that currently can not take advantage of the proliferation of the available parallel resources in each chip.", acknowledgement = ack-nhfb, affiliation = "Marquez, DG (Reprint Author), Univ Buenos Aires, Fac Ciencias Exactas \& Nat, Dept Comp Sci, C1428EGA, RA-1053 Buenos Aires, DF, Argentina. Marquez, David Gonzalez; Mocskos, Esteban, Univ Buenos Aires, Fac Ciencias Exactas \& Nat, Dept Comp Sci, C1428EGA, RA-1053 Buenos Aires, DF, Argentina. Mocskos, Esteban, CSC CONICET, C1425FQD, RA-2390 Buenos Aires, DF, Argentina. Kestelman, Adrian Cristal, CSIC, IIIA, Barcelona Supercomp Ctr, ES-08034 Barcelona, Spain. Kestelman, Adrian Cristal, Univ Politecn Cataluna, Dept Comp Architecture, ES-08034 Barcelona, Spain.", author-email = "dmarquez@dc.uba.ar adrian.cristal@bsc.es emocskos@dc.uba.ar", da = "2019-06-20", doc-delivery-number = "EY5PB", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Universidad de Buenos Aires [UBACyT 20020130200096BA]; CONICET [PIP 11220110100379]", funding-text = "This work was partially funded by grants from Universidad de Buenos Aires (UBACyT 20020130200096BA) and CONICET (PIP 11220110100379). The authors thank specially Osman Unsal for reading this article with fruitful criticism.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "multicore processing; multithreading; Parallel architectures; parallel programming", keywords-plus = "PARALLELISM", number-of-cited-references = "11", ORCID-numbers = "Mocskos, Esteban/0000-0002-6473-7672", research-areas = "Computer Science", times-cited = "0", unique-id = "Marquez:2017:MCH", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Matheou:2017:DDC, author = "George Matheou and Paraskevas Evripidou", title = "Data-Driven Concurrency for High Performance Computing", journal = j-TACO, volume = "14", number = "4", pages = "53:1--53:??", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3162014", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Dec 22 18:25:55 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "In this work, we utilize dynamic dataflow/data-driven techniques to improve the performance of high performance computing (HPC) systems. The proposed techniques are implemented and evaluated through an efficient, portable, and robust programming framework that enables data-driven concurrency on HPC systems. The proposed framework is based on data-driven multithreading (DDM), a hybrid control-flow/dataflow model that schedules threads based on data availability on sequential processors. The proposed framework was evaluated using several benchmarks, with different characteristics, on two different systems: a 4-node AMD system with a total of 128 cores and a 64-node Intel HPC system with a total of 768 cores. The performance evaluation shows that the proposed framework scales well and tolerates scheduling overheads and memory latencies effectively. We also compare our framework to MPI, DDM-VM, and OmpSs@Cluster. 
The comparison results show that the proposed framework obtains comparable or better performance.", acknowledgement = ack-nhfb, articleno = "53", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", } @Article{Meier:2017:PVM, author = "Remigius Meier and Armin Rigo and Thomas R. Gross", title = "Parallel virtual machines with {RPython}", journal = j-SIGPLAN, volume = "52", number = "2", pages = "48--59", month = feb, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3093334.2989233", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The RPython framework takes an interpreter for a dynamic language as its input and produces a Virtual Machine (VM) for that language. RPython is being used to develop PyPy, a high-performance Python interpreter. However, the produced VM does not support parallel execution since the framework relies on a Global Interpreter Lock (GIL): PyPy serialises the execution of multi-threaded Python programs. We describe the rationale and design of a new parallel execution model for RPython that allows the generation of parallel virtual machines while leaving the language semantics unchanged. This model then allows different implementations of concurrency control, and we discuss an implementation based on a GIL and an implementation based on Software Transactional Memory (STM). To evaluate the benefits of either choice, we adapt PyPy to work with both implementations (GIL and STM). 
The evaluation shows that PyPy with STM improves the runtime of a set of multi-threaded Python programs over PyPy with a GIL by factors in the range of 1.87 $ \times $ up to 5.96 $ \times $ when executing on a processor with 8 cores.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "DLS '16 conference proceedings.", } @Article{Nazarpour:2017:CPS, author = "Hosein Nazarpour and Yli{\`e}s Falcone and Saddek Bensalem and Marius Bozga", title = "Concurrency-preserving and sound monitoring of multi-threaded component-based systems: theory, algorithms, implementation, and evaluation", journal = j-FORM-ASP-COMPUT, volume = "29", number = "6", pages = "951--986", month = nov, year = "2017", CODEN = "FACME5", DOI = "https://doi.org/10.1007/s00165-017-0422-6", ISSN = "0934-5043 (print), 1433-299X (electronic)", ISSN-L = "0934-5043", bibdate = "Thu Nov 23 07:37:44 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/formaspcomput.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer.com/article/10.1007/s00165-017-0422-6", acknowledgement = ack-nhfb, fjournal = "Formal Aspects of Computing", journal-URL = "http://link.springer.com/journal/165", } @Article{Nutaro:2017:HAA, author = "James Nutaro and Bernard Zeigler", title = "How to apply {Amdahl}'s law to multithreaded multicore processors", journal = j-J-PAR-DIST-COMP, volume = "107", number = "??", pages = "1--2", month = sep, year = "2017", CODEN = "JPDCER", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Sat Aug 19 13:10:31 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.sciencedirect.com/science/article/pii/S0743731517300941", acknowledgement = ack-nhfb, fjournal = "Journal of Parallel and Distributed Computing", journal-URL = 
"http://www.sciencedirect.com/science/journal/07437315", } @InProceedings{Olivieri:2017:IOP, author = "Mauro Olivieri and Abdallah Cheikh and Gianmarco Cerutti and Antonio Mastrandrea and Francesco Menichelli", editor = "{IEEE}", booktitle = "{2017 New Generation of CAS (NGCAS)}", title = "Investigation on the Optimal Pipeline Organization in {RISC-V} Multi-threaded Soft Processor Cores", publisher = pub-IEEE, address = pub-IEEE:adr, pages = "45--48", year = "2017", DOI = "https://doi.org/10.1109/NGCAS.2017.61", bibdate = "Sat Dec 16 15:51:40 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/risc-v.bib", acknowledgement = ack-nhfb, } @Article{Park:2017:HHC, author = "Jaehyun Park and Seungcheol Baek and Hyung Gyu Lee and Chrysostomos Nicopoulos and Vinson Young and Junghee Lee and Jongman Kim", title = "{HoPE}: Hot-Cacheline Prediction for Dynamic Early Decompression in Compressed {LLCs}", journal = j-TODAES, volume = "22", number = "3", pages = "40:1--40:??", month = may, year = "2017", CODEN = "ATASFO", DOI = "https://doi.org/10.1145/2999538", ISSN = "1084-4309 (print), 1557-7309 (electronic)", ISSN-L = "1084-4309", bibdate = "Fri Jul 21 10:49:30 MDT 2017", bibsource = "http://www.acm.org/pubs/contents/journals/todaes/; https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/todaes.bib", abstract = "Data compression plays a pivotal role in improving system performance and reducing energy consumption, because it increases the logical effective capacity of a compressed memory system without physically increasing the memory size. However, data compression techniques incur some cost, such as non-negligible compression and decompression overhead. This overhead becomes more severe if compression is used in the cache. 
In this article, we aim to minimize the read-hit decompression penalty in compressed Last-Level Caches (LLCs) by speculatively decompressing frequently used cachelines. To this end, we propose a Hot-cacheline Prediction and Early decompression (HoPE) mechanism that consists of three synergistic techniques: Hot-cacheline Prediction (HP), Early Decompression (ED), and Hit-history-based Insertion (HBI). HP and HBI efficiently identify the hot compressed cachelines, while ED selectively decompresses hot cachelines, based on their size information. Unlike previous approaches, the HoPE framework considers the performance balance/tradeoff between the increased effective cache capacity and the decompression penalty. To evaluate the effectiveness of the proposed HoPE mechanism, we run extensive simulations on memory traces obtained from multi-threaded benchmarks running on a full-system simulation framework. We observe significant performance improvements over compressed cache schemes employing the conventional Least-Recently Used (LRU) replacement policy, the Dynamic Re-Reference Interval Prediction (DRRIP) scheme, and the Effective Capacity Maximizer (ECM) compressed cache management mechanism. 
Specifically, HoPE exhibits system performance improvements of approximately 11\%, on average, over LRU, 8\% over DRRIP, and 7\% over ECM by reducing the read-hit decompression penalty by around 65\%, over a wide range of applications.", acknowledgement = ack-nhfb, articleno = "40", fjournal = "ACM Transactions on Design Automation of Electronic Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J776", } @Article{Pathania:2017:DTM, author = "Anuj Pathania and Vanchinathan Venkataramani and Muhammad Shafique and Tulika Mitra and J{\"o}rg Henkel", title = "Defragmentation of Tasks in Many-Core Architecture", journal = j-TACO, volume = "14", number = "1", pages = "2:1--2:??", month = apr, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3050437", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jul 24 18:00:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Many-cores can execute multiple multithreaded tasks in parallel. A task performs most efficiently when it is executed over a spatially connected and compact subset of cores so that performance loss due to communication overhead imposed by the task's threads spread across the allocated cores is minimal. Over a span of time, unallocated cores can get scattered all over the many-core, creating fragments in the task mapping. These fragments can prevent efficient contiguous mapping of incoming new tasks leading to loss of performance. This problem can be alleviated by using a task defragmenter, which consolidates smaller fragments into larger fragments wherein the incoming tasks can be efficiently executed. Optimal defragmentation of a many-core is an NP-hard problem in the general case. Therefore, we simplify the original problem to a problem that can be solved optimally in polynomial time. 
In this work, we introduce a concept of exponentially separable mapping (ESM), which defines a set of task mapping constraints on a many-core. We prove that an ESM enforcing many-core can be defragmented optimally in polynomial time.", acknowledgement = ack-nhfb, articleno = "2", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", } @Article{Pereira:2017:SBC, author = "Phillipe Pereira and Higo Albuquerque and Isabela da Silva and Hendrio Marques and Felipe Monteiro and Ricardo Ferreira and Lucas Cordeiro", title = "{SMT}-based context-bounded model checking for {CUDA} programs", journal = j-CCPE, volume = "29", number = "22", pages = "??--??", day = "25", month = nov, year = "2017", CODEN = "CCPEBO", DOI = "https://doi.org/10.1002/cpe.3934", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Sat Dec 30 09:11:59 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/ccpe.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Concurrency and Computation: Practice and Experience", journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626", } @Article{Radulovic:2017:LLI, author = "Milan B. Radulovi{\'c} and Sylvain Girbal and Milo V. 
Tomasevi{\'c}", title = "Low-level implementation of the {SISC} protocol for thread-level speculation on a multi-core architecture", journal = j-PARALLEL-COMPUTING, volume = "67", number = "??", pages = "1--19", month = sep, year = "2017", CODEN = "PACOEJ", ISSN = "0167-8191 (print), 1872-7336 (electronic)", ISSN-L = "0167-8191", bibdate = "Wed Aug 9 14:49:25 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib", URL = "http://www.sciencedirect.com/science/article/pii/S0167819117300972", acknowledgement = ack-nhfb, fjournal = "Parallel Computing", journal-URL = "http://www.sciencedirect.com/science/journal/01678191", } @Article{Reiche:2017:AVI, author = "Oliver Reiche and Christof Kobylko and Frank Hannig and J{\"u}rgen Teich", title = "Auto-vectorization for image processing {DSLs}", journal = j-SIGPLAN, volume = "52", number = "4", pages = "21--30", month = may, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3140582.3081039", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat Sep 16 10:18:15 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The parallelization of programs and distributing their workloads to multiple threads can be a challenging task. In addition to multi-threading, harnessing vector units in CPUs proves highly desirable. However, employing vector units to speed up programs can be quite tedious. Either a program developer solely relies on the auto-vectorization capabilities of the compiler or he manually applies vector intrinsics, which is extremely error-prone, difficult to maintain, and not portable at all. 
Based on whole-function vectorization, a method to replace control flow with data flow, we propose auto-vectorization techniques for image processing DSLs in the context of source-to-source compilation. The approach does not require the input to be available in SSA form. Moreover, we formulate constraints under which the vectorization analysis and code transformations may be greatly simplified in the context of image processing DSLs. As part of our methodology, we present control flow to data flow transformation as a source-to-source translation. Moreover, we propose a method to efficiently analyze algorithms with mixed bit-width data types to determine the optimal SIMD width, independently of the target instruction set. The techniques are integrated into an open source DSL framework. Subsequently, the vectorization capabilities are compared to a variety of existing state-of-the-art C/C++ compilers. A geometric mean speedup of up to 3.14 is observed for benchmarks taken from ISPC and image processing, compared to non-vectorized executions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "LCTES '17 conference proceedings.", } @Article{Saarikivi:2017:MTS, author = "Olli Saarikivi and Hern{\'a}n Ponce-De-Le{\'o}n and Kari K{\"a}hk{\"o}nen and Keijo Heljanko and Javier Esparza", title = "Minimizing Test Suites with Unfoldings of Multithreaded Programs", journal = j-TECS, volume = "16", number = "2", pages = "45:1--45:??", month = apr, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3012281", ISSN = "1539-9087 (print), 1558-3465 (electronic)", ISSN-L = "1539-9087", bibdate = "Mon Jul 24 09:51:12 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/tecs.bib", abstract = "This article focuses on computing minimal test suites for multithreaded programs. 
Based on previous work on test case generation for multithreaded programs using unfoldings, this article shows how this unfolding can be used to generate minimal test suites covering all local states of the program. Generating such minimal test suites is shown to be NP-complete in the size of the unfolding. We propose an SMT encoding for this problem and two methods based on heuristics which only approximate the solution, but scale better in practice. Finally, we apply our methods to compute the minimal test suites for several benchmarks.", acknowledgement = ack-nhfb, articleno = "45", fjournal = "ACM Transactions on Embedded Computing Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J840", } @Article{Sanderson:2017:PGP, author = "Conrad Sanderson and Ryan Curtin", title = "\pkg{gmm\_diag} and \pkg{gmm\_full}: {C++} classes for multi-threaded {Gaussian} mixture models and Expectation-Maximisation", journal = j-J-OPEN-SOURCE-SOFT, volume = "2", number = "18", pages = "365:1--365:2", month = oct, year = "2017", CODEN = "????", DOI = "https://doi.org/10.21105/joss.00365", ISSN = "2475-9066", ISSN-L = "2475-9066", bibdate = "Thu Sep 13 08:09:35 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/joss.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://joss.theoj.org/papers/10.21105/joss.00365", acknowledgement = ack-nhfb, fjournal = "Journal of Open Source Software", journal-URL = "http://joss.theoj.org/; https://github.com/openjournals/joss-papers/", onlinedate = "16 October 2017", ORCID-numbers = "Conrad Sanderson / 0000-0002-0049-4501; Ryan Curtin / 0000-0002-9903-8214", } @Article{Schafer:2017:PHL, author = "Benjamin Carrion Schafer", title = "Parallel High-Level Synthesis Design Space Exploration for Behavioral {IPs} of Exact Latencies", journal = j-TODAES, volume = "22", number = "4", pages = "65:1--65:??", month = jul, year = "2017", CODEN = "ATASFO", DOI = "https://doi.org/10.1145/3041219", ISSN = "1084-4309 
(print), 1557-7309 (electronic)", ISSN-L = "1084-4309", bibdate = "Mon Jan 22 09:03:32 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/todaes.bib", abstract = "This work presents a Design Space Exploration (DSE) method for Behavioral IPs (BIPs) given in ANSI-C or SystemC to find the smallest micro-architecture for a specific target latency. Previous work on High-Level Synthesis (HLS) DSE mainly focused on finding a tradeoff curve with Pareto-optimal designs. HLS is, however, a single process (component) synthesis method. Very often, the latency of the components requires a specific fixed latency when inserted within a larger system. This work presents a fast multi-threaded method to find the smallest micro-architecture for a given BIP and target latency by discriminating between all different exploration knobs and exploring these concurrently. Experimental results show that our proposed method is very effective and comprehensive results compare the quality of results vs. the speedup of our proposed explorer.", acknowledgement = ack-nhfb, articleno = "65", fjournal = "ACM Transactions on Design Automation of Electronic Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J776", } @Article{Tian:2017:RSP, author = "Z. Tian and T. Liu and Q. Zheng and E. Zhuang and M. Fan and Z. 
Yang", title = "Reviving Sequential Program Birthmarking for Multithreaded Software Plagiarism Detection", journal = j-IEEE-TRANS-SOFTW-ENG, volume = "PP", number = "99", pages = "1--1", month = "????", year = "2017", CODEN = "IESEDJ", DOI = "https://doi.org/10.1109/TSE.2017.2688383", ISSN = "0098-5589 (print), 1939-3520 (electronic)", ISSN-L = "0098-5589", bibdate = "Thu Feb 1 19:49:24 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/ieeetranssoftweng2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=7888597", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Software Engineering", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=32", } @Article{Turakhia:2017:TPE, author = "Yatish Turakhia and Guangshuo Liu and Siddharth Garg and Diana Marculescu", title = "Thread Progress Equalization: Dynamically Adaptive Power-Constrained Performance Optimization of Multi-Threaded Applications", journal = j-IEEE-TRANS-COMPUT, volume = "66", number = "4", pages = "731--744", month = "????", year = "2017", CODEN = "ITCOB4", DOI = "https://doi.org/10.1109/TC.2016.2608951", ISSN = "0018-9340 (print), 1557-9956 (electronic)", ISSN-L = "0018-9340", bibdate = "Sat Mar 11 14:24:09 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Computers", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12", } @Article{Utterback:2017:POR, author = "Robert Utterback and Kunal Agrawal and I-Ting Angelina Lee and Milind Kulkarni", title = "Processor-Oblivious Record and Replay", journal = j-SIGPLAN, volume = "52", number = "8", pages = "145--161", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3018764", ISSN = 
"0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Record-and-replay systems are useful tools for debugging non-deterministic parallel programs by first recording an execution and then replaying that execution to produce the same access pattern. Existing record-and-replay systems generally target thread-based execution models, and record the behaviors and interleavings of individual threads. Dynamic multithreaded languages and libraries, such as the Cilk family, OpenMP, TBB, etc., do not have a notion of threads. Instead, these languages provide a processor-oblivious model of programming, where programs expose task-parallelism using high-level constructs such as spawn/sync without regard to the number of threads/cores available to run the program. Thread-based record-and-replay would violate the processor-oblivious nature of these programs, as they incorporate the number of threads into the recorded information, constraining the replayed execution to the same number of threads. In this paper, we present a processor-oblivious record-and-replay scheme for such languages where record and replay can use different number of processors and both are scheduled using work stealing. We provide theoretical guarantees for our record and replay scheme --- namely that record is optimal for programs with one lock and replay is near-optimal for all cases. 
In addition, we implemented this scheme in the Cilk Plus runtime system and our evaluation indicates that processor-obliviousness does not cause substantial overheads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "PPoPP '17 conference proceedings.", } @Article{Wang:2017:JRJ, author = "Kaiyuan Wang and Sarfraz Khurshid and Milos Gligoric", title = "{JPR}: Replaying {JPF} Traces Using Standard {JVM}", journal = j-SIGSOFT, volume = "42", number = "4", pages = "1--5", month = oct, year = "2017", CODEN = "SFENDP", DOI = "https://doi.org/10.1145/3149485.3149494", ISSN = "0163-5948 (print), 1943-5843 (electronic)", ISSN-L = "0163-5948", bibdate = "Wed Aug 1 17:16:48 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigsoft2010.bib", abstract = "Java PathFinder (JPF) is a backtrackable Java Virtual Machine (JVM), which is implemented in Java and runs on a standard JVM (e.g., Oracle HotSpot). Thus, a JPF developer can use off-the-shelf Java debuggers (e.g., jdb) when debugging code that makes up JPF. JPF explores all non-deterministic executions of a given target program and monitors for property violations. To facilitate debugging of the target program, JPF can capture and replay the execution trace that leads to a property violation. While the deterministic replay is invaluable, the replay with JPF does not allow the developer to attach an off-the-shelf Java debugger to the target program (e.g., step through the application code, set breakpoints, etc.). We present a technique, dubbed JPR, to improve the debugging experience of the JPF captured traces by migrating the JPF traces to a new format that can be executed using the standard JVM. 
JPR annotates each JPF trace, during the capture phase, with extra data (e.g., instruction index, instruction count, etc.); the annotated trace is then used to instrument Java bytecode to enforce the same execution trace on a standard JVM. JPR is compatible with various optimizations, e.g., state matching and partial-order reduction. We evaluated JPR on all multithreaded Java programs in the official JPF distribution. Our results show that JPR successfully replayed all JPF traces on the standard JVM with reasonable overhead during both recording and replaying.", acknowledgement = ack-nhfb, fjournal = "ACM SIGSOFT Software Engineering Notes", journal-URL = "https://dl.acm.org/citation.cfm?id=J728", } @Article{Yeh:2017:PFG, author = "Tsung Tai Yeh and Amit Sabne and Putt Sakdhnagool and Rudolf Eigenmann and Timothy G. Rogers", title = "{Pagoda}: Fine-Grained {GPU} Resource Virtualization for Narrow Tasks", journal = j-SIGPLAN, volume = "52", number = "8", pages = "221--234", month = aug, year = "2017", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3155284.3018754", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Dec 1 18:56:12 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Massively multithreaded GPUs achieve high throughput by running thousands of threads in parallel. To fully utilize the hardware, workloads spawn work to the GPU in bulk by launching large tasks, where each task is a kernel that contains thousands of threads that occupy the entire GPU. GPUs face severe underutilization and their performance benefits vanish if the tasks are narrow, i.e., they contain {$<$} 500 threads. 
Latency-sensitive applications in network, signal, and image processing that generate a large number of tasks with relatively small inputs are examples of such limited parallelism. This paper presents Pagoda, a runtime system that virtualizes GPU resources, using an OS-like daemon kernel called MasterKernel. Tasks are spawned from the CPU onto Pagoda as they become available, and are scheduled by the MasterKernel at the warp granularity. Experimental results demonstrate that Pagoda achieves a geometric mean speedup of 5.70x over PThreads running on a 20-core CPU, 1.51x over CUDA-HyperQ, and 1.69x over GeMTC, the state-of- the-art runtime GPU task scheduling system.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "PPoPP '17 conference proceedings.", } @Article{Adams:2018:TTV, author = "Joel C. Adams and Patrick A. Crain and Christopher P. Dilley and Christiaan D. Hazlett and Elizabeth R. Koning and Serita M. Nelesen and Javin B. Unger and Mark B. Vande Stel", title = "{TSGL}: A tool for visualizing multithreaded behavior", journal = j-J-PAR-DIST-COMP, volume = "118 (part 1)", number = "??", pages = "233--246", month = aug, year = "2018", CODEN = "JPDCER", DOI = "https://doi.org/10.1016/j.jpdc.2018.02.025", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Sat May 12 16:27:31 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "https://www.sciencedirect.com/science/article/pii/S0743731518301035", acknowledgement = ack-nhfb, fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", } @Article{AlBarakat:2018:MFM, author = "Laith M. AlBarakat and Paul {Gratz, V} and Daniel A. 
Jimenez", title = "{MTB-Fetch}: Multithreading Aware Hardware Prefetching for Chip Multiprocessors", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "2", pages = "175--178", month = jul # "\slash " # dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2847345", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Jun 20 17:18:18 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "To fully exploit the scaling performance in Chip Multiprocessors, applications must be divided into semi-independent processes that can run concurrently on multiple cores within a system. One major class of such applications, shared-memory, multi-threaded applications, requires programmers insert thread synchronization primitives (i.e., locks, barriers, and condition variables) in their critical sections to synchronize data access between processes. For this class of applications, scaling performance requires balanced per-thread workloads with little time spent in critical sections. In practice, however, threads often waste significant time waiting to acquire locks/barriers in their critical sections, leading to thread imbalance and poor performance scaling. Moreover, critical sections often stall data prefetchers that mitigate the effects of long critical section stalls by ensuring data is preloaded in the core caches when the critical section is complete. In this paper we examine a pure hardware technique to enable safe data prefetching beyond synchronization points in CMPs. We show that successful prefetching beyond synchronization points requires overcoming two significant challenges in existing prefetching techniques. First, we find that typical data prefetchers are designed to trigger prefetches based on current misses. 
This approach works well for traditional, continuously executing, single-threaded applications. However, when a thread stalls on a synchronization point, it typically does not produce any new memory references to trigger a prefetcher. Second, even in the event that a prefetch were to be correctly directed to read beyond a synchronization point, it will likely prefetch shared data from another core before this data has been written. While this prefetch would be considered ``accurate'' it is highly undesirable, because such a prefetch would lead to three extra ``ping-pong'' movements back and forth between private caches in the producing and consuming cores, incurring more latency and energy overhead than without prefetching. We develop a new data prefetcher, Multi-Thread B-Fetch (MTB-Fetch), built as an extension to a previous single-threaded data prefetcher. MTB-Fetch addresses both issues in prefetching for shared memory multi-threaded workloads. MTB-Fetch achieves a speedup of 9.3 percent for multi-threaded applications with little additional hardware.", acknowledgement = ack-nhfb, affiliation = "AlBarakat, LM (Reprint Author), Texas A\&M Univ, Dept Elect \& Comp Engn, College Stn, TX 77843 USA. AlBarakat, Laith M.; Gratz, Paul, V, Texas A\&M Univ, Dept Elect \& Comp Engn, College Stn, TX 77843 USA. Jimenez, Daniel A., Texas A\&M Univ, Dept Comp Sci \& Engn, College Stn, TX 77843 USA.", author-email = "lalbarakat@tamu.edu pgratz@tamu.edu djimenez@cse.tamu.edu", da = "2019-06-20", doc-delivery-number = "GP4TI", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "National Science Foundation [I/UCRC-1439722, CCF-1649242, CCF-1216604/1332598]; Intel Corp.", funding-text = "We thank the National Science Foundation, which partially supported this work through grants I/UCRC-1439722, CCF-1649242 and CCF-1216604/1332598 and Intel Corp. for their generous support.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Chip multiprocessor; hardware prefetching; multi-threading; shared memory", keywords-plus = "PROCESSORS", number-of-cited-references = "17", research-areas = "Computer Science", times-cited = "1", unique-id = "AlBarakat:2018:MFM", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Amer:2018:LCM, author = "Abdelhalim Amer and Huiwei Lu and Pavan Balaji and Milind Chabbi and Yanjie Wei and Jeff Hammond and Satoshi Matsuoka", title = "Lock Contention Management in Multithreaded {MPI}", journal = j-TOPC, volume = "5", number = "3", pages = "12:1--12:??", month = jan, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3275443", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Wed Jan 23 16:12:26 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/topc.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3275443", abstract = "In this article, we investigate contention management in lock-based thread-safe MPI libraries. Specifically, we make two assumptions: (1) locks are the only form of synchronization when protecting communication paths; and (2) contention occurs, and thus serialization is unavoidable. Our work distinguishes between lock acquisitions with respect to work being performed inside a critical section; productive vs. unproductive. Waiting for message reception without doing anything else inside a critical section is an example of unproductive lock acquisition. We show that the high-throughput nature of modern scalable locking protocols translates into better communication progress for throughput-intensive MPI communication but negatively impacts latency-sensitive communication because of overzealous unproductive lock acquisition. 
To reduce unproductive lock acquisitions, we devised a method that promotes threads with productive work using a generic two-level priority locking protocol. Our results show that using a high-throughput protocol for productive work and a fair protocol for less productive code paths ensures the best tradeoff for fine-grained communication, whereas a fair protocol is sufficient for more coarse-grained communication. Although these efforts have been rewarding, scalability degradation remains significant. We discuss techniques that diverge from the pure locking model and offer the potential to further improve scalability.", acknowledgement = ack-nhfb, articleno = "12", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Catalan:2018:MTD, author = "Sandra Catal{\'a}n and Jos{\'e} R. Herrero and Francisco D. Igual and Rafael Rodr{\'\i}guez-S{\'a}nchez and Enrique S. Quintana-Ort{\'\i} and Chris Adeniyi-Jones", title = "Multi-threaded dense linear algebra libraries for low-power asymmetric multicore processors", journal = j-J-COMPUT-SCI, volume = "25", pages = "140--151", month = mar, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1016/j.jocs.2016.10.020", ISSN = "1877-7503 (print), 1877-7511 (electronic)", ISSN-L = "1877-7503", bibdate = "Tue Sep 19 13:54:39 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jcomputsci.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "https://www.sciencedirect.com/science/article/pii/S1877750316302812", acknowledgement = ack-nhfb, ajournal = "J. Comput. 
Sci.", fjournal = "Journal of Computational Science", journal-URL = "https://www.sciencedirect.com/journal/journal-of-computational-science", } @Article{Chen:2018:ESE, author = "Kuan-Chung Chen and Chung-Ho Chen", title = "Enabling {SIMT} Execution Model on Homogeneous Multi-Core System", journal = j-TACO, volume = "15", number = "1", pages = "6:1--6:??", month = apr, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3177960", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:19:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Single-instruction multiple-thread (SIMT) machine emerges as a primary computing device in high-performance computing, since the SIMT execution paradigm can exploit data-level parallelism effectively. This article explores the SIMT execution potential on homogeneous multi-core processors, which generally run in multiple-instruction multiple-data (MIMD) mode when utilizing the multi-core resources. We address three architecture issues in enabling SIMT execution model on multi-core processor, including multithreading execution model, kernel thread context placement, and thread divergence. For the SIMT execution model, we propose a fine-grained multithreading mechanism on an ARM-based multi-core system. Each of the processor cores stores the kernel thread contexts in its L1 data cache for per-cycle thread-switching requirement. For divergence-intensive kernels, an Inner Conditional Statement First (ICS-First) mechanism helps early re-convergence to occur and significantly improves the performance. 
The experiment results show that effectiveness in data-parallel processing reduces on average 36\% dynamic instructions, and boosts the SIMT executions to achieve on average 1.52$ \times $ and up to 5$ \times $ speedups over the MIMD counterpart for OpenCL benchmarks for single issue in-order processor cores. By using the explicit vectorization optimization on the kernels, the SIMT model gains further benefits from the SIMD extension and achieves 1.71$ \times $ speedup over the MIMD approach. The SIMT model using in-order superscalar processor cores outperforms the MIMD model that uses superscalar out-of-order processor cores by 40\%. The results show that, to exploit data-level parallelism, enabling the SIMT model on homogeneous multi-core processors is important.", acknowledgement = ack-nhfb, articleno = "6", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", } @Article{Chen:2018:ROM, author = "Kuan-Hsun Chen and Georg von der Br{\"u}ggen and Jian-Jia Chen", title = "Reliability Optimization on Multi-Core Systems with Multi-Tasking and Redundant Multi-Threading", journal = j-IEEE-TRANS-COMPUT, volume = "67", number = "4", pages = "484--497", month = "????", year = "2018", CODEN = "ITCOB4", DOI = "https://doi.org/10.1109/TC.2017.2769044", ISSN = "0018-9340 (print), 1557-9956 (electronic)", ISSN-L = "0018-9340", bibdate = "Thu Mar 15 08:52:31 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://ieeexplore.ieee.org/document/8094023/", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Computers", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12", } @Article{Chin:2018:EAN, author = "Wei-Sheng Chin and Bo-Wen Yuan and Meng-Yuan Yang and Chih-Jen Lin", title = "An Efficient Alternating {Newton} Method for Learning Factorization Machines", 
journal = j-TIST, volume = "9", number = "6", pages = "72:1--72:??", month = nov, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3230710", ISSN = "2157-6904 (print), 2157-6912 (electronic)", ISSN-L = "2157-6904", bibdate = "Thu Nov 15 16:23:08 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/tist.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3230710", abstract = "To date, factorization machines (FMs) have emerged as a powerful model in many applications. In this work, we study the training of FM with the logistic loss for binary classification, which is a nonlinear extension of the linear model with the logistic loss (i.e., logistic regression). For the training of large-scale logistic regression, Newton methods have been shown to be an effective approach, but it is difficult to apply such methods to FM because of the nonconvexity. We consider a modification of FM that is multiblock convex and propose an alternating minimization algorithm based on Newton methods. Some novel optimization techniques are introduced to reduce the running time. Our experiments demonstrate that the proposed algorithm is more efficient than stochastic gradient algorithms and coordinate descent methods. The parallelism of our method is also investigated for the acceleration in multithreading environments.", acknowledgement = ack-nhfb, articleno = "72", fjournal = "ACM Transactions on Intelligent Systems and Technology (TIST)", journal-URL = "http://portal.acm.org/citation.cfm?id=J1318", } @Article{Deiana:2018:UPN, author = "Enrico A. Deiana and Vincent St-Amour and Peter A. 
Dinda and Nikos Hardavellas and Simone Campanoni", title = "Unconventional Parallelization of Nondeterministic Applications", journal = j-SIGPLAN, volume = "53", number = "2", pages = "432--447", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173181", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "The demand for thread-level-parallelism (TLP) on commodity processors is endless as it is essential for gaining performance and saving energy. However, TLP in today's programs is limited by dependences that must be satisfied at run time. We have found that for nondeterministic programs, some of these actual dependences can be satisfied with alternative data that can be generated in parallel, thus boosting the program's TLP. Satisfying these dependences with alternative data nonetheless produces final outputs that match those of the original nondeterministic program. To demonstrate the practicality of our technique, we describe the design, implementation, and evaluation of our compilers, autotuner, profiler, and runtime, which are enabled by our proposed C++ programming language extensions. 
The resulting system boosts the performance of six well-known nondeterministic and multi-threaded benchmarks by 158.2\% (geometric mean) on a 28-core Intel-based platform.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "ASPLOS '18 proceedings.", } @Article{DeLozier:2018:SSO, author = "Christian DeLozier and Ariel Eizenberg and Brandon Lucia and Joseph Devietti", title = "{SOFRITAS}: Serializable Ordering-Free Regions for Increasing Thread Atomicity Scalably", journal = j-SIGPLAN, volume = "53", number = "2", pages = "286--300", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173192", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Correctly synchronizing multithreaded programs is challenging and errors can lead to program failures such as atomicity violations. Existing strong memory consistency models rule out some possible failures, but are limited by depending on programmer-defined locking code. We present the new Ordering-Free Region (OFR) serializability consistency model that ensures atomicity for OFRs, which are spans of dynamic instructions between consecutive ordering constructs (e.g., barriers), without breaking atomicity at lock operations. Our platform, Serializable Ordering-Free Regions for Increasing Thread Atomicity Scalably (SOFRITAS), ensures a C/C++ program's execution is equivalent to a serialization of OFRs by default. We build two systems that realize the SOFRITAS idea: a concurrency bug finding tool for testing called SOFRITEST, and a production runtime system called SOPRO. 
SOFRITEST uses OFRs to find concurrency bugs, including a multi-critical-section atomicity violation in memcached that weaker consistency models will miss. If OFR's are too coarse-grained, SOFRITEST suggests refinement annotations automatically. Our software-only SOPRO implementation has high performance, scales well with increased parallelism, and prevents failures despite bugs in locking code. SOFRITAS has an average overhead of just 1.59x on a single-threaded execution and 1.51x on sixteen threads, despite pthreads' much weaker memory model.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "ASPLOS '18 proceedings.", } @Article{DePestel:2018:RRP, author = "Sander {De Pestel} and Sam {Van den Steen} and Shoaib Akram and Lieven Eeckhout", title = "{RPPM}: Rapid Performance Prediction of Multithreaded Applications on Multicore Hardware", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "2", pages = "183--186", month = jul # "\slash " # dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2849983", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Jun 20 17:18:18 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "This paper proposes RPPM which, based on a microarchitecture-independent profile of a multithreaded application, predicts its performance on a previously unseen multicore platform. RPPM breaks up multithreaded program execution into epochs based on synchronization primitives, and then predicts per-epoch active execution times for each thread and synchronization overhead to arrive at a prediction for overall application performance. RPPM predicts performance within 12 percent on average (27 percent max error) compared to cycle-level simulation. 
We present a case study to illustrate that RPPM can be used for making accurate multicore design trade-offs early in the design cycle.", acknowledgement = ack-nhfb, affiliation = "De Pestel, S (Reprint Author), Univ Ghent, B-9000 Ghent, Belgium. De Pestel, Sander; Van den Steen, Sam; Akram, Shoaib; Eeckhout, Lieven, Univ Ghent, B-9000 Ghent, Belgium.", author-email = "sander.depestel@ugent.be sam.vandensteen@ugent.be shoaib.akram@ugent.be lieven.eeckhout@ugent.be", da = "2019-06-20", doc-delivery-number = "GP4TI", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "Agency for Innovation by Science and Technology in Flanders (IWT); European Research Council (ERC) [741097]", funding-text = "Sander De Pestel is supported through a doctoral fellowship by the Agency for Innovation by Science and Technology in Flanders (IWT). Additional support is provided through the European Research Council (ERC) Advanced Grant agreement no. 741097.", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "micro-architecture; Modeling; multi-threaded; performance", number-of-cited-references = "12", ORCID-numbers = "Van den Steen, Sam/0000-0003-3630-2214", research-areas = "Computer Science", times-cited = "1", unique-id = "Pestel:2018:RRP", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Deveci:2018:MSM, author = "Mehmet Deveci and Christian Trott and Sivasankaran Rajamanickam", title = "Multithreaded sparse matrix--matrix multiplication for many-core and {GPU} architectures", journal = j-PARALLEL-COMPUTING, volume = "78", number = "??", pages = "33--46", month = oct, year = "2018", CODEN = "PACOEJ", DOI = "https://doi.org/10.1016/j.parco.2018.06.009", ISSN = "0167-8191 (print), 1872-7336 (electronic)", ISSN-L = "0167-8191", bibdate = "Mon Jan 7 15:25:20 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/parallelcomputing.bib", URL = "http://www.sciencedirect.com/science/article/pii/S0167819118301923", acknowledgement = ack-nhfb, fjournal = "Parallel Computing", journal-URL = "http://www.sciencedirect.com/science/journal/01678191", } @Article{Ding:2018:IOC, author = "Bailu Ding and Lucja Kot and Johannes Gehrke", title = "Improving optimistic concurrency control through transaction batching and operation reordering", journal = j-PROC-VLDB-ENDOWMENT, volume = "12", number = "2", pages = "169--182", month = oct, year = "2018", CODEN = "????", DOI = "https://doi.org/10.14778/3282495.3282502", ISSN = "2150-8097", bibdate = "Wed Jan 2 18:29:48 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "OLTP systems can often improve throughput by batching transactions and processing them as a group. 
Batching has been used for optimizations such as message packing and group commits; however, there is little research on the benefits of a holistic approach to batching across a transaction's entire life cycle. In this paper, we present a framework to incorporate batching at multiple stages of transaction execution for OLTP systems based on optimistic concurrency control. Storage batching enables reordering of transaction reads and writes at the storage layer, reducing conflicts on the same object. Validator batching enables reordering of transactions before validation, reducing conflicts between transactions. Dependencies between transactions make transaction reordering a non-trivial problem, and we propose several efficient and practical algorithms that can be customized to various transaction precedence policies such as reducing tail latency. We also show how to reorder transactions with a thread-aware policy in multi-threaded OLTP architecture without a centralized validator. In-depth experiments on a research prototype, an open-source OLTP system, and a production OLTP system show that our techniques increase transaction throughput by up to 2.2x and reduce their tail latency by up to 71\% compared with the state-of-the-art systems on workloads with high data contention.", acknowledgement = ack-nhfb, fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Drechsler:2018:TSR, author = "Joscha Drechsler and Ragnar Mogk and Guido Salvaneschi and Mira Mezini", title = "Thread-safe reactive programming", journal = j-PACMPL, volume = "2", number = "OOPSLA", pages = "107:1--107:30", month = oct, year = "2018", DOI = "https://doi.org/10.1145/3276477", bibdate = "Sat Aug 8 07:56:30 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pacmpl.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3276477", abstract = "The execution of an application 
written in a reactive language involves transfer of data and control flow between imperative and reactive abstractions at well-defined points. In a multi-threaded environment, multiple such interactions may execute \ldots{}", acknowledgement = ack-nhfb, articleno = "107", fjournal = "Proceedings of the ACM on Programming Languages", journal-URL = "https://pacmpl.acm.org/", } @Article{Fix:2018:HMT, author = "Jordan Fix and Nayana P. Nagendra and Sotiris Apostolakis and Hansen Zhang and Sophie Qiu and David I. August", title = "Hardware Multithreaded Transactions", journal = j-SIGPLAN, volume = "53", number = "2", pages = "15--29", month = feb, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296957.3173172", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:56 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Speculation with transactional memory systems helps programmers and compilers produce profitable thread-level parallel programs. Prior work shows that supporting transactions that can span multiple threads, rather than requiring transactions be contained within a single thread, enables new types of speculative parallelization techniques for both programmers and parallelizing compilers. Unfortunately, software support for multi-threaded transactions (MTXs) comes with significant additional inter-thread communication overhead for speculation validation. This overhead can make otherwise good parallelization unprofitable for programs with sizeable read and write sets. Some programs using these prior software MTXs overcame this problem through significant efforts by expert programmers to minimize these sets and optimize communication, capabilities which compiler technology has been unable to equivalently achieve. 
Instead, this paper makes speculative parallelization less laborious and more feasible through low-overhead speculation validation, presenting the first complete design, implementation, and evaluation of hardware MTXs. Even with maximal speculation validation of every load and store inside transactions of tens to hundreds of millions of instructions, profitable parallelization of complex programs can be achieved. Across 8 benchmarks, this system achieves a geomean speedup of 99\% over sequential execution on a multicore machine with 4 cores.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "ASPLOS '18 proceedings.", } @Article{Forsell:2018:RMM, author = "Martti Forsell and Jussi Roivainen and Ville Lepp{\"a}nen", title = "{REPLICA MBTAC}: multithreaded dual-mode processor", journal = j-J-SUPERCOMPUTING, volume = "74", number = "5", pages = "1911--1933", month = may, year = "2018", CODEN = "JOSUED", DOI = "https://doi.org/10.1007/s11227-017-2199-z", ISSN = "0920-8542 (print), 1573-0484 (electronic)", ISSN-L = "0920-8542", bibdate = "Thu Oct 10 15:31:12 MDT 2019", bibsource = "http://link.springer.com/journal/11227/74/5; https://www.math.utah.edu/pub/tex/bib/jsuper.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "The Journal of Supercomputing", journal-URL = "http://link.springer.com/journal/11227", } @Article{Gerbessiotis:2018:SIS, author = "Alexandros V. 
Gerbessiotis", title = "A Study of Integer Sorting on Multicores", journal = j-PARALLEL-PROCESS-LETT, volume = "28", number = "04", pages = "??--??", month = dec, year = "2018", DOI = "https://doi.org/10.1142/S0129626418500147", ISSN = "0129-6264 (print), 1793-642X (electronic)", ISSN-L = "0129-6264", bibdate = "Mon Mar 29 12:30:05 MDT 2021", bibsource = "http://ejournals.wspc.com.sg/ppl/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/parallelprocesslett.bib", URL = "https://www.worldscientific.com/doi/10.1142/S0129626418500147", abstract = "Integer sorting on multicores and GPUs can be realized by a variety of approaches that include variants of distribution-based methods such as radix-sort, comparison-oriented algorithms such as deterministic regular sampling and random sampling parallel sorting, and network-based algorithms such as Batcher's bitonic sorting algorithm. In this work we present an experimental study of integer sorting on multicore processors. We have implemented serial and parallel radix-sort for various radixes, deterministic regular oversampling, and random oversampling parallel sorting, including new variants of ours, and also some previously little explored or unexplored variants of bitonic-sort and odd-even transposition sort. The study uses multithreading and multiprocessing parallel programming libraries with the same C language code working under Open MPI, MulticoreBSP, and BSPlib. We first provide some general high-level observations on the performance of these implementations. If we can conclude anything is that accurate prediction of performance by taking into consideration architecture dependent features such as the structure and characteristics of multiple memory hierarchies is difficult and more often than not untenable. To some degree this is affected by the overhead imposed by the high-level library used in the programming effort. 
Another objective is to model the performance of these algorithms and their implementations under the MBSP (Multi-memory BSP) model. Despite the limitations mentioned above, we can still draw some reliable conclusions and reason about the performance of these implementations using the MBSP model, thus making MBSP useful and usable.", acknowledgement = ack-nhfb, fjournal = "Parallel Processing Letters", journal-URL = "http://www.worldscientific.com/loi/ppl", } @Article{Gu:2018:CCA, author = "Ronghui Gu and Zhong Shao and Jieung Kim and Xiongnan (Newman) Wu and J{\'e}r{\'e}mie Koenig and Vilhelm Sj{\"o}berg and Hao Chen and David Costanzo and Tahina Ramananandro", title = "Certified concurrent abstraction layers", journal = j-SIGPLAN, volume = "53", number = "4", pages = "646--661", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192381", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Concurrent abstraction layers are ubiquitous in modern computer systems because of the pervasiveness of multithreaded programming and multicore hardware. Abstraction layers are used to hide the implementation details (e.g., fine-grained synchronization) and reduce the complex dependencies among components at different levels of abstraction. Despite their obvious importance, concurrent abstraction layers have not been treated formally. This severely limits the applicability of layer-based techniques and makes it difficult to scale verification across multiple concurrent layers. In this paper, we present CCAL---a fully mechanized programming toolkit developed under the CertiKOS project---for specifying, composing, compiling, and linking certified concurrent abstraction layers. 
CCAL consists of three technical novelties: a new game-theoretical, strategy-based compositional semantic model for concurrency (and its associated program verifiers), a set of formal linking theorems for composing multithreaded and multicore concurrent layers, and a new CompCertX compiler that supports certified thread-safe compilation and linking. The CCAL toolkit is implemented in Coq and supports layered concurrent programming in both C and assembly. It has been successfully applied to build a fully certified concurrent OS kernel with fine-grained locking.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "PLDI '18 proceedings.", } @Article{Hukerikar:2018:RIA, author = "Saurabh Hukerikar and Keita Teranishi and Pedro C. Diniz and Robert F. Lucas", title = "{RedThreads}: An Interface for Application-Level Fault Detection\slash Correction Through Adaptive Redundant Multithreading", journal = j-INT-J-PARALLEL-PROG, volume = "46", number = "2", pages = "225--251", month = apr, year = "2018", CODEN = "IJPPE5", DOI = "https://doi.org/10.1007/s10766-017-0492-3", ISSN = "0885-7458 (print), 1573-7640 (electronic)", ISSN-L = "0885-7458", bibdate = "Fri Oct 11 08:37:50 MDT 2019", bibsource = "http://link.springer.com/journal/10766/46/2; https://www.math.utah.edu/pub/tex/bib/intjparallelprogram.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "International Journal of Parallel Programming", journal-URL = "http://link.springer.com/journal/10766", } @Article{Iliakis:2018:DMS, author = "Konstantinos Iliakis and Sotirios Xydis and Dimitrios Soudris", title = "Decoupled {MapReduce} for Shared-Memory Multi-Core Architectures", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "2", pages = "143--146", month = jul # "\slash " # dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2827929", ISSN = "1556-6056 
(print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Jun 20 17:18:18 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Modern multi-core processors exhibit high integration densities, e.g., up to several tens of cores. Multiple programming frameworks have emerged to facilitate the development of highly parallel applications. The MapReduce programming model, after having demonstrated its usability in the area of distributed computing systems, has been adapted to the needs of shared-memory multi-processors showing promising results in comparison with conventional multi-threaded libraries, e.g., pthreads. In this paper we enhance the traditional MapReduce architecture by decoupling the map and combine phases in order to boost parallel execution. We show that combiners' memory intensive features limit the system's degree of parallelism, thus resulting in sub-optimal hardware utilization, leaving space for further performance improvements. The proposed decoupled MapReduce architecture is evaluated into a NUMA server platform, showing that the adoption of the De-MapR runtime enables more efficient hardware utilization and competent run-time improvements. We demonstrate that the proposed solution achieves execution speedups of up to 2.46x compared to a state-of-the-art, shared-memory MapReduce library.", acknowledgement = ack-nhfb, affiliation = "Iliakis, K (Reprint Author), Natl Tech Univ Athens, Zografos 15780, Greece. Iliakis, Konstantinos; Xydis, Sotirios; Soudris, Dimitrios, Natl Tech Univ Athens, Zografos 15780, Greece.", author-email = "konstantinos.iliakis@cern.ch sxydis@microlab.ntua.gr dsoudris@microlab.ntua.gr", da = "2019-06-20", doc-delivery-number = "GP4TI", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "MapReduce; multi-cores; runtime systems", number-of-cited-references = "13", ORCID-numbers = "Soudris, Dimitrios/0000-0002-6930-6847", research-areas = "Computer Science", researcherid-numbers = "Soudris, Dimitrios/O-8843-2019", times-cited = "0", unique-id = "Iliakis:2018:DMS", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Jacobs:2018:MTV, author = "Bart Jacobs and Dragan Bosnacki and Ruurd Kuiper", title = "Modular Termination Verification of Single-Threaded and Multithreaded Programs", journal = j-TOPLAS, volume = "40", number = "3", pages = "12:1--12:??", month = aug, year = "2018", CODEN = "ATPSDT", DOI = "https://doi.org/10.1145/3210258", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Thu Oct 18 12:01:50 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/toplas.bib", abstract = "We propose an approach for the modular specification and verification of total correctness properties of object-oriented programs. The core of our approach is a specification style that prescribes a way to assign a level expression to each method such that each callee's level is below the caller's, even in the presence of dynamic binding. The specification style yields specifications that properly hide implementation details. The main idea is to use multisets of method names as levels, and to associate with each object levels that abstractly reflect the way the object is built from other objects. A method's level is then defined in terms of the method's own name and the levels associated with the objects passed as arguments. We first present the specification style in the context of programs that do not modify object fields. 
We then combine it with separation logic and abstract predicate families to obtain an approach for programs with heap mutation. In a third step, we address concurrency, by incorporating an existing approach for verifying deadlock freedom of channels and locks. Our main contribution here is to achieve information hiding by using the proposed termination levels for lock ordering as well. Also, we introduce call permissions to enable elegant verification of termination of programs where threads cause work in other threads, such as in thread pools or fine-grained concurrent algorithms involving compare-and-swap loops. We explain how our approach can be used also to verify the liveness of nonterminating programs.", acknowledgement = ack-nhfb, articleno = "12", fjournal = "ACM Transactions on Programming Languages and Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783", } @Article{Kahkonen:2018:TPC, author = "Kari K{\"a}hk{\"o}nen and Keijo Heljanko", title = "Testing Programs with Contextual Unfoldings", journal = j-TECS, volume = "17", number = "1", pages = "23:1--23:??", month = jan, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/2810000", ISSN = "1539-9087 (print), 1558-3465 (electronic)", ISSN-L = "1539-9087", bibdate = "Thu Oct 17 18:16:34 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/tecs.bib", abstract = "In this article, we present a new algorithm that combines contextual unfoldings and dynamic symbolic execution to systematically test multithreaded programs. The approach uses symbolic execution to limit the number of input values and unfoldings to thus limit the number of thread interleavings that are needed to cover reachable local states of threads in the program under test. We show that the use of contextual unfoldings allows interleavings of threads to be succinctly represented. 
This can in some cases lead to a substantial reduction in the number of needed test executions when compared to previous approaches.", acknowledgement = ack-nhfb, articleno = "23", fjournal = "ACM Transactions on Embedded Computing Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J840", } @Article{Kislal:2018:ECC, author = "Orhan Kislal and Jagadish Kotra and Xulong Tang and Mahmut Taylan Kandemir and Myoungsoo Jung", title = "Enhancing computation-to-core assignment with physical location information", journal = j-SIGPLAN, volume = "53", number = "4", pages = "312--327", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192386", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Going beyond a certain number of cores in modern architectures requires an on-chip network more scalable than conventional buses. However, employing an on-chip network in a manycore system (to improve scalability) makes the latencies of the data accesses issued by a core non-uniform. This non-uniformity can play a significant role in shaping the overall application performance. This work presents a novel compiler strategy which involves exposing architecture information to the compiler to enable an optimized computation-to-core mapping. Specifically, we propose a compiler-guided scheme that takes into account the relative positions of (and distances between) cores, last-level caches (LLCs) and memory controllers (MCs) in a manycore system, and generates a mapping of computations to cores with the goal of minimizing the on-chip network traffic. 
The experimental data collected using a set of 21 multi-threaded applications reveal that, on an average, our approach reduces the on-chip network latency in a 6$ \times $6 manycore system by 38.4\% in the case of private LLCs, and 43.8\% in the case of shared LLCs. These improvements translate to the corresponding execution time improvements of 10.9\% and 12.7\% for the private LLC and shared LLC based systems, respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "PLDI '18 proceedings.", } @Article{Kondguli:2018:BUS, author = "Sushant Kondguli and Michael Huang", title = "{Bootstrapping}: Using {SMT} Hardware to Improve Single-Thread Performance", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "2", pages = "205--208", month = jul # "\slash " # dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2018.2859945", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Jun 20 17:18:18 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Decoupled look-ahead (DLA) architectures have been shown to be an effective way to improve single-thread performance. However, a default implementation requires an additional core. While an SMT flavor is possible, a naive implementation is inefficient and thus slow. In this paper, we propose an optimized implementation called Bootstrapping that makes DLA just as effective on a single (SMT) core as using two cores. While fusing two cores can improve single-thread performance by 1.23x, Bootstrapping provides a speedup of 1.51.", acknowledgement = ack-nhfb, affiliation = "Kondguli, S (Reprint Author), Univ Rochester, Dept Elect \& Comp Engn, Rochester, NY 14627 USA. 
Kondguli, Sushant; Huang, Michael, Univ Rochester, Dept Elect \& Comp Engn, Rochester, NY 14627 USA.", author-email = "sushant.kondguli@rochester.edu michael.huang@rochester.edu", da = "2019-06-20", doc-delivery-number = "HA2CO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", funding-acknowledgement = "NSF [1514433, 1533842]", funding-text = "This work is supported in part by NSF under grants 1514433 and 1533842.", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Decoupled look-ahead (DLA) architectures; simultaneous multi-threading (SMT); single thread performance", number-of-cited-references = "20", research-areas = "Computer Science", times-cited = "1", unique-id = "Kondguli:2018:BUS", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Lee:2018:ERD, author = "I-Ting Angelina Lee and Tao B. Schardl", title = "Efficient Race Detection for Reducer Hyperobjects", journal = j-TOPC, volume = "4", number = "4", pages = "20:1--20:??", month = sep, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3205914", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Wed Jan 23 16:12:25 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "A multithreaded Cilk program that is ostensibly deterministic may nevertheless behave nondeterministically due to programming errors in the code. For a Cilk program that uses reducers-a general reduction mechanism supported in various Cilk dialects-such programming errors are especially challenging to debug, because the errors can expose the nondeterminism in how the Cilk runtime system manages reducers. We identify two unique types of races that arise from incorrect use of reducers in a Cilk program, and we present two algorithms to catch these races. 
The first algorithm, called the Peer-Set algorithm, detects view-read races, which occur when the program attempts to retrieve a value out of a reducer when the read may result in a nondeterministic value, such as before all previously spawned subcomputations that might update the reducer have necessarily returned. The second algorithm, called the SP+ algorithm, detects determinacy races-instances where a write to a memory location occurs logically in parallel with another access to that location-even when the raced-on memory locations relate to reducers. Both algorithms are provably correct, asymptotically efficient, and can be implemented efficiently in practice. We have implemented both algorithms in our prototype race detector, Rader. When running Peer-Set, Rader incurs a geometric-mean multiplicative overhead of 2.56 over running the benchmark without instrumentation. When running SP+, Rader incurs a geometric-mean multiplicative overhead of 16.94.", acknowledgement = ack-nhfb, articleno = "20", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Liu:2018:ISI, author = "Hongyu Liu and Sam Silvestro and Wei Wang and Chen Tian and Tongping Liu", title = "{iReplayer}: in-situ and identical record-and-replay for multithreaded applications", journal = j-SIGPLAN, volume = "53", number = "4", pages = "344--358", month = apr, year = "2018", CODEN = "SINODQ", DOI = "https://doi.org/10.1145/3296979.3192380", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Oct 16 14:12:57 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", abstract = "Reproducing executions of multithreaded programs is very challenging due to many intrinsic and external non-deterministic factors. 
Existing RnR systems achieve significant progress in terms of performance overhead, but none targets the in-situ setting, in which replay occurs within the same process as the recording process. Also, most existing work cannot achieve identical replay, which may prevent the reproduction of some errors. This paper presents iReplayer, which aims to identically replay multithreaded programs in the original process (under the ``in-situ'' setting). The novel in-situ and identical replay of iReplayer makes it more likely to reproduce errors, and allows it to directly employ debugging mechanisms (e.g. watchpoints) to aid failure diagnosis. Currently, iReplayer only incurs 3\% performance overhead on average, which allows it to be always enabled in the production environment. iReplayer enables a range of possibilities, and this paper presents three examples: two automatic tools for detecting buffer overflows and use-after-free bugs, and one interactive debugging tool that is integrated with GDB.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J706", remark = "PLDI '18 proceedings.", } @Article{Lochbihler:2018:MTS, author = "Andreas Lochbihler", title = "Mechanising a Type-Safe Model of Multithreaded {Java} with a Verified Compiler", journal = j-J-AUTOM-REASON, volume = "61", number = "1--4", pages = "243--332", month = jun, year = "2018", CODEN = "JAREEW", DOI = "https://doi.org/10.1007/s10817-018-9452-x", ISSN = "0168-7433 (print), 1573-0670 (electronic)", ISSN-L = "0168-7433", bibdate = "Sat Aug 4 07:51:41 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/jautomreason.bib; https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer.com/article/10.1007/s10817-018-9452-x", acknowledgement = ack-nhfb, fjournal = "Journal of Automated Reasoning", journal-URL = "http://link.springer.com/journal/10817", } 
@Article{Maabreh:2018:MHT, author = "Majdi Maabreh and Hafez Irshid and Ajay Gupta and Izzat Alasmadi", title = "A multithreading and hashing technique for indexing {Target--Decoy} peptides databases", journal = j-CCPE, volume = "30", number = "9", pages = "??--??", day = "10", month = may, year = "2018", CODEN = "CCPEBO", DOI = "https://doi.org/10.1002/cpe.4371", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Sat Aug 4 10:03:13 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/ccpe.bib; https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "https://onlinelibrary.wiley.com/doi/abs/10.1002/cpe.4371", acknowledgement = ack-nhfb, fjournal = "Concurrency and Computation: Practice and Experience", journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626", } @InProceedings{Malakhov:2018:CMT, author = "Anton Malakhov and David Liu and Anton Gorshkov and Terry Wilmarth", editor = "Fatih Akici and David Lippa and Dillon Niederhut and M Pacer", booktitle = "Proceedings of the {17th Python in Science Conference, Austin, TX, 9--15 July 2018}", title = "Composable Multi-Threading and Multi-Processing for Numeric Libraries", publisher = "????", address = "????", pages = "15--21", year = "2018", bibdate = "Wed Aug 1 09:03:36 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/python.bib", URL = "http://conference.scipy.org/proceedings/scipy2018/anton_malakhov.html", abstract = "Python is popular among scientific communities that value its simplicity and power, especially as it comes along with numeric libraries such as NumPy, SciPy, Dask, and Numba. As CPU core counts keep increasing, these modules can make use of many cores via multi-threading for efficient multi-core parallelism. 
However, threads can interfere with each other leading to overhead and inefficiency if used together in a single application on machines with a large number of cores. This performance loss can be prevented if all multi-threaded modules are coordinated. This paper continues the work started in AMala16 by introducing more approaches to coordination for both multi-threading and multi-processing cases. In particular, we investigate the use of static settings, limiting the number of simultaneously active OpenMP parallel regions, and optional parallelism with Intel Threading Building Blocks (Intel TBB). We will show how these approaches help to unlock additional performance for numeric applications on multi-core systems.", acknowledgement = ack-nhfb, keywords = "Dask; GIL; Joblib; Multi-core; Multi-processing; Multi-threading; Nested Parallelism; NumPy; OpenMP; Oversubscription; Parallel Computations; Python; SciPy; TBB", } @Article{Muller:2018:CPG, author = "Stefan K. Muller and Umut A. Acar and Robert Harper", title = "Competitive parallelism: getting your priorities right", journal = j-PACMPL, volume = "2", number = "ICFP", pages = "95:1--95:30", month = jul, year = "2018", DOI = "https://doi.org/10.1145/3236790", bibdate = "Fri Aug 7 17:44:42 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pacmpl.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3236790", abstract = "Multi-threaded programs have traditionally fallen into one of two domains: cooperative and competitive. 
These two domains have traditionally remained mostly disjoint, with cooperative threading used for increasing throughput in compute-intensive \ldots{}", acknowledgement = ack-nhfb, articleno = "95", fjournal = "Proceedings of the ACM on Programming Languages", journal-URL = "https://pacmpl.acm.org/", } @Article{Pham:2018:TSM, author = "Binh Pham and Derek Hower and Abhishek Bhattacharjee and Trey Cain", title = "{TLB} Shootdown Mitigation for Low-Power Many-Core Servers with {L1} Virtual Caches", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "1", pages = "17--20", month = jan # "\slash " # jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2017.2712140", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Jun 20 17:18:18 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Power efficiency has become one of the most important design constraints for high-performance systems. In this paper, we revisit the design of low-power virtually-addressed caches. While virtually-addressed caches enable significant power savings by obviating the need for Translation Lookaside Buffer (TLB) lookups, they suffer from several challenging design issues that curtail their widespread commercial adoption. We focus on one of these challenges-cache flushes due to virtual page remappings. We use detailed studies on an ARM many-core server to show that this problem degrades performance by up to 25 percent for a mix of multi-programmed and multi-threaded workloads. Interestingly, we observe that many of these flushes are spurious, and caused by an indiscriminate invalidation broadcast on ARM architecture. 
In response, we propose a low-overhead and readily implementable hardware mechanism using bloom filters to reduce spurious invalidations and mitigate their ill effects.", acknowledgement = ack-nhfb, affiliation = "Pham, B (Reprint Author), Rutgers State Univ, Dept Comp Sci, Piscataway, NJ 08854 USA. Binh Pham; Bhattacharjee, Abhishek, Rutgers State Univ, Dept Comp Sci, Piscataway, NJ 08854 USA. Hower, Derek, Qualcomm Technol Inc, Piscataway, NJ 08854 USA. Cain, Trey, Qualcomm Datactr Technol Inc, Piscataway, NJ 08854 USA.", author-email = "binhpham@rutgers.edu dhower@qti.qualcomm.com abhib@rutgers.edu tcain@qti.qualcomm.com", da = "2019-06-20", doc-delivery-number = "FZ6EO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "multicores; multiprogramming; multithreading; TLB; Virtual Cache; virtual memory", number-of-cited-references = "21", research-areas = "Computer Science", times-cited = "0", unique-id = "Pham:2018:TSM", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Polap:2018:MTL, author = "Dawid Polap and Marcin Wo{\'z}niak and Wei Wei and Robertas Damasevicius", title = "Multi-threaded learning control mechanism for neural networks", journal = j-FUT-GEN-COMP-SYS, volume = "87", number = "??", pages = "16--34", month = oct, year = "2018", CODEN = "FGSEVI", ISSN = "0167-739X (print), 1872-7115 (electronic)", ISSN-L = "0167-739X", bibdate = "Tue Jun 26 08:47:57 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/futgencompsys.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "https://www.sciencedirect.com/science/article/pii/S0167739X18300931", acknowledgement = ack-nhfb, fjournal = "Future Generation Computer Systems", journal-URL = "http://www.sciencedirect.com/science/journal/0167739X", } @Article{Roberts:2018:MID, author = "Malcolm Roberts 
and John C. Bowman", title = "Multithreaded implicitly dealiased convolutions", journal = j-J-COMPUT-PHYS, volume = "356", number = "??", pages = "98--114", day = "1", month = mar, year = "2018", CODEN = "JCTPAH", ISSN = "0021-9991 (print), 1090-2716 (electronic)", ISSN-L = "0021-9991", bibdate = "Sat Jan 13 12:33:11 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/jcomputphys2015.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.sciencedirect.com/science/article/pii/S0021999117308641", acknowledgement = ack-nhfb, fjournal = "Journal of Computational Physics", journal-URL = "http://www.sciencedirect.com/science/journal/00219991", } @Article{Sahin:2018:CSC, author = "Semih Sahin and Bugra Gedik", title = "{C-Stream}: a Co-routine-Based Elastic Stream Processing Engine", journal = j-TOPC, volume = "4", number = "3", pages = "15:1--15:??", month = apr, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3184120", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Wed Jan 23 16:12:25 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Stream processing is a computational paradigm for on-the-fly processing of live data. This paradigm lends itself to implementations that can provide high throughput and low latency by taking advantage of various forms of parallelism that are naturally captured by the stream processing model of computation, such as pipeline, task, and data parallelism. In this article, we describe the design and implementation of C-Stream, which is an elastic stream processing engine. C-Stream encompasses three unique properties. 
First, in contrast to the widely adopted event-based interface for developing streaming operators, C-Stream provides an interface wherein each operator has its own driver loop and relies on data availability application programming interfaces (APIs) to decide when to perform its computations. This self-control-based model significantly simplifies the development of operators that require multiport synchronization. Second, C-Stream contains a dynamic scheduler that manages the multithreaded execution of the operators. The scheduler, which is customizable via plug-ins, enables the execution of the operators as co-routines, using any number of threads. The base scheduler implements back-pressure, provides data availability APIs, and manages preemption and termination handling. Last, C-Stream varies the degree of parallelism to resolve bottlenecks by both dynamically changing the number of threads used to execute an application and adjusting the number of replicas of data-parallel operators. We provide an experimental evaluation of C-Stream. 
The results show that C-Stream is scalable, highly customizable, and can resolve bottlenecks by dynamically adjusting the level of data parallelism used.", acknowledgement = ack-nhfb, articleno = "15", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Sangaiah:2018:SSA, author = "Karthik Sangaiah and Michael Lui and Radhika Jagtap and Stephan Diestelhorst and Siddharth Nilakantan and Ankit More and Baris Taskin and Mark Hempstead", title = "{SynchroTrace}: Synchronization-Aware Architecture-Agnostic Traces for Lightweight Multicore Simulation of {CMP} and {HPC} Workloads", journal = j-TACO, volume = "15", number = "1", pages = "2:1--2:??", month = apr, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3158642", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:19:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Trace-driven simulation of chip multiprocessor (CMP) systems offers many advantages over execution-driven simulation, such as reducing simulation time and complexity, allowing portability, and scalability. However, trace-based simulation approaches have difficulty capturing and accurately replaying multithreaded traces due to the inherent nondeterminism in the execution of multithreaded programs. In this work, we present SynchroTrace, a scalable, flexible, and accurate trace-based multithreaded simulation methodology. By recording synchronization events relevant to modern threading libraries (e.g., Pthreads and OpenMP) and dependencies in the traces, independent of the host architecture, the methodology is able to accurately model the nondeterminism of multithreaded programs for different hardware platforms and threading paradigms. 
Through capturing high-level instruction categories, the SynchroTrace average CPI trace Replay timing model offers fast and accurate simulation of many-core in-order CMPs. We perform two case studies to validate the SynchroTrace simulation flow against the gem5 full-system simulator: (1) a constraint-based design space exploration with traditional CMP benchmarks and (2) a thread-scalability study with HPC-representative applications. The results from these case studies show that (1) our trace-based approach with trace filtering has a peak speedup of up to 18.7$ \times $ over simulation in gem5 full-system with an average of 9.6$ \times $ speedup, (2) SynchroTrace maintains the thread-scaling accuracy of gem5 and can efficiently scale up to 64 threads, and (3) SynchroTrace can trace in one platform and model any platform in early stages of design.", acknowledgement = ack-nhfb, articleno = "2", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", } @Article{Schmitt:2018:RHG, author = "Christian Schmitt and Moritz Schmid and Sebastian Kuckuk and Harald K{\"o}stler and J{\"u}rgen Teich and Frank Hannig", title = "Reconfigurable Hardware Generation of Multigrid Solvers with Conjugate Gradient Coarse-Grid Solution", journal = j-PARALLEL-PROCESS-LETT, volume = "28", number = "04", pages = "??--??", month = dec, year = "2018", DOI = "https://doi.org/10.1142/S0129626418500160", ISSN = "0129-6264 (print), 1793-642X (electronic)", ISSN-L = "0129-6264", bibdate = "Mon Mar 29 12:30:05 MDT 2021", bibsource = "http://ejournals.wspc.com.sg/ppl/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/parallelprocesslett.bib", URL = "https://www.worldscientific.com/doi/10.1142/S0129626418500160", abstract = "Not only in the field of high-performance computing (HPC), field programmable gate arrays (FPGAs) are a soaringly popular accelerator technology. 
However, they use a completely different programming paradigm and tool set compared to central processing units (CPUs) or even graphics processing units (GPUs), adding extra development steps and requiring special knowledge, hindering widespread use in scientific computing. To bridge this programmability gap, domain-specific languages (DSLs) are a popular choice to generate low-level implementations from an abstract algorithm description. In this work, we demonstrate our approach for the generation of numerical solver implementations based on the multigrid method for FPGAs from the same code base that is also used to generate code for CPUs using a hybrid parallelization of MPI and OpenMP. Our approach yields in a hardware design that can compute up to 11 V-cycles per second with an input grid size of 4096 {\texttimes} \{\texttimes} {\texttimes} 4096 and solution on the coarsest using the conjugate gradient (CG) method on a mid-range FPGA, beating vectorized, multi-threaded execution on an Intel Xeon processor.", acknowledgement = ack-nhfb, fjournal = "Parallel Processing Letters", journal-URL = "http://www.worldscientific.com/loi/ppl", } @Article{Scionti:2018:EMM, author = "Alberto Scionti and Somnath Mazumdar and Stephane Zuckerman", title = "Enabling Massive Multi-Threading with Fast Hashing", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "17", number = "1", pages = "1--4", month = jan # "\slash " # jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1109/LCA.2017.2697863", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Thu Jun 20 17:18:18 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The next generation of high-performance computers is expected to execute threads in orders of magnitude higher than today's systems. 
Improper management of such huge amount of threads can create resource contention, leading to overall degraded system performance. By leveraging more practical approaches to distribute threads on the available resources, execution models and manycore chips are expected to overcome limitations of current systems. Here, we present DELTA --- a Data-Enabled muLti-Threaded Architecture, where a producer-consumer scheme is used to execute threads via complete distributed thread management mechanism. We consider a manycore tiled-chip architecture where Network-on-Chip (NoC) routers are extended to support our execution model. The proposed extension is analysed, while simulation results confirm that DELTA can manage a large number of simultaneous threads, relying on a simple hardware structure.", acknowledgement = ack-nhfb, affiliation = "Scionti, A (Reprint Author), ISMB, I-10138 Turin, Italy. Scionti, Alberto, ISMB, I-10138 Turin, Italy. Mazumdar, Somnath, Univ Siena, Siena, SI, Italy. Zuckerman, Stephane, Michigan Technol Univ, Houghton, MI 49931 USA.", author-email = "scionti@ismb.it mazumdar@dii.unisi.it szuckerm@mtu.edu", da = "2019-06-20", doc-delivery-number = "FZ6EO", eissn = "1556-6064", fjournal = "IEEE Computer Architecture Letters", journal-iso = "IEEE Comput. Archit. 
Lett.", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Dataflow; hashing; network-on-chip; thread-scheduling", number-of-cited-references = "13", research-areas = "Computer Science", times-cited = "1", unique-id = "Scionti:2018:EMM", web-of-science-categories = "Computer Science, Hardware \& Architecture", } @Article{Tang:2018:CND, author = "Xulong Tang and Mahmut Taylan Kandemir and Hui Zhao and Myoungsoo Jung and Mustafa Karakoy", title = "Computing with Near Data", journal = j-POMACS, volume = "2", number = "3", pages = "42:1--42:30", month = dec, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3287321", ISSN = "2476-1249", ISSN-L = "2476-1249", bibdate = "Mon Mar 29 10:31:29 MDT 2021", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pomacs.bib", URL = "https://dl.acm.org/doi/10.1145/3287321", abstract = "One cost that plays a significant role in shaping the overall performance of both single-threaded and multi-thread applications in modern computing systems is the cost of moving data between compute elements and storage elements.
Traditional approaches \ldots{}", acknowledgement = ack-nhfb, articleno = "42", fjournal = "Proceedings of the ACM on Measurement and Analysis of Computing Systems (POMACS)", journal-URL = "https://dl.acm.org/loi/pomacs", } @Article{Thebault:2018:AMC, author = "Lo{\"\i}c Th{\'e}bault and Eric Petit", title = "Asynchronous and multithreaded communications on irregular applications using vectorized divide and conquer approach", journal = j-J-PAR-DIST-COMP, volume = "114", number = "??", pages = "16--27", month = apr, year = "2018", CODEN = "JPDCER", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Tue Feb 6 13:52:05 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.sciencedirect.com/science/article/pii/S0743731517303350", acknowledgement = ack-nhfb, fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", } @Article{Tian:2018:RSP, author = "Zhenzhou Tian and Ting Liu and Qinghua Zheng and Eryue Zhuang and Ming Fan and Zijiang Yang", title = "Reviving Sequential Program Birthmarking for Multithreaded Software Plagiarism Detection", journal = j-IEEE-TRANS-SOFTW-ENG, volume = "44", number = "5", pages = "491--511", month = may, year = "2018", CODEN = "IESEDJ", DOI = "https://doi.org/10.1109/TSE.2017.2688383", ISSN = "0098-5589 (print), 1939-3520 (electronic)", ISSN-L = "0098-5589", bibdate = "Thu Jun 14 08:43:22 MDT 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranssoftweng2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "https://ieeexplore.ieee.org/document/7888597/", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Software Engineering", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=32", } @Book{Troutwine:2018:HCR, author = "Brian L. 
Troutwine", title = "Hands-on Concurrency with {Rust}: Confidently Build Memory-safe, Parallel, and Efficient Software in {Rust}", publisher = pub-PACKT, address = pub-PACKT:adr, pages = "v + 449", year = "2018", ISBN = "1-78839-997-8 (paperback), 1-78847-835-5", ISBN-13 = "978-1-78839-997-5 (paperback), 978-1-78847-835-9", LCCN = "QA76.76.A65", bibdate = "Tue Dec 10 05:53:29 MST 2019", bibsource = "fsz3950.oclc.org:210/WorldCat; https://www.math.utah.edu/pub/tex/bib/master.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://proquest.safaribooksonline.com/?fpi=9781788399975", abstract = "Get to grips with modern software demands by learning the effective uses of Rust's powerful memory safety. Key Features Learn and improve the sequential performance characteristics of your software Understand the use of operating system processes in a high-scale concurrent system Learn of the various coordination methods available in the Standard library. Most programming languages can really complicate things, especially with regard to unsafe memory access. The burden on you, the programmer, lies across two domains: understanding the modern machine and your language's pain-points. This book will teach you how to manage program performance on modern machines and build fast, memory-safe, and concurrent software in Rust. It starts with the fundamentals of Rust and discusses machine architecture concepts. You will be taken through ways to measure and improve the performance of Rust code systematically and how to write collections with confidence. You will learn about the Sync and Send traits applied to threads, and coordinate thread execution with locks, atomic primitives, data-parallelism, and more. The book will show you how to efficiently embed Rust in C++ code and explore the functionalities of various crates for multithreaded applications. It explores implementations in depth. You will know how a mutex works and build several yourself.
You will master radically different approaches that exist in the ecosystem for structuring and managing high-scale systems. By the end of the book, you will feel comfortable with designing safe, consistent, parallel, and high-performance applications in Rust. What you will learn Probe your programs for performance and accuracy issues Create your own threading and multi-processing environment in Rust Use coarse locks from Rust's Standard library Solve common synchronization problems or avoid synchronization using atomic programming Build lock-free/wait-free structures in Rust and understand their implementations in the crates ecosystem Leverage Rust's memory model and type system to build safety properties into your parallel programs Understand the new features of the Rust programming language to ease the writing of parallel programs. Who this book is for. This book is aimed at software engineers with a basic understanding of Rust who want to exploit the parallel and concurrent nature of modern computing environments, safely.", acknowledgement = ack-nhfb, libnote = "Not in my library.", subject = "Application software; Development; Computer multitasking; Programming languages (Electronic computers); Portable and handheld devices: consumer/user guides; Mobile phones: consumer/user guides; Parallel processing; Programming and scripting languages: general; Computers; Programming; Parallel; Hardware; Handheld Devices; Programming Languages; C; Development; Computer multitasking; Programming languages (Electronic computers)", } @Article{Wang:2018:SPE, author = "Wenjun Wang and Wei-Ming Lin", title = "System performance enhancement with thread suspension for simultaneous multi-threading processors", journal = j-INT-J-COMPUT-APPL, volume = "42", number = "8", pages = "774--786", year = "2018", CODEN = "IJCAFW", DOI = "https://doi.org/10.1080/1206212X.2018.1489572", ISSN = "1206-212X (print), 1925-7074 (electronic)", ISSN-L = "1206-212X", bibdate = "Fri Apr 12 14:06:55 MDT
2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/ijca.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.tandfonline.com/doi/full/10.1080/1206212X.2018.1489572", acknowledgement = ack-nhfb, ajournal = "Int. J. Comput. Appl.", fjournal = "International Journal of Computers and Applications", journal-URL = "https://www.tandfonline.com/loi/tjca20", } @Article{Wang:2018:TWB, author = "Jui-Hsien Wang and Ante Qu and Timothy R. Langlois and Doug L. James", title = "Toward wave-based sound synthesis for computer animation", journal = j-TOG, volume = "37", number = "4", pages = "109:1--109:??", month = aug, year = "2018", CODEN = "ATGRDF", DOI = "https://doi.org/10.1145/3197517.3201318", ISSN = "0730-0301 (print), 1557-7368 (electronic)", ISSN-L = "0730-0301", bibdate = "Thu Nov 29 17:19:43 MST 2018", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/tog.bib", abstract = "We explore an integrated approach to sound generation that supports a wide variety of physics-based simulation models and computer-animated phenomena. Targeting high-quality offline sound synthesis, we seek to resolve animation-driven sound radiation with near-field scattering and diffraction effects. The core of our approach is a sharp-interface finite-difference time-domain (FDTD) wavesolver, with a series of supporting algorithms to handle rapidly deforming and vibrating embedded interfaces arising in physics-based animation sound. Once the solver rasterizes these interfaces, it must evaluate acceleration boundary conditions (BCs) that involve model-and phenomena-specific computations. We introduce acoustic shaders as a mechanism to abstract away these complexities, and describe a variety of implementations for computer animation: near-rigid objects with ringing and acceleration noise, deformable (finite element) models such as thin shells, bubble-based water, and virtual characters. 
Since time-domain wave synthesis is expensive, we only simulate pressure waves in a small region about each sound source, then estimate a far-field pressure signal. To further improve scalability beyond multi-threading, we propose a fully time-parallel sound synthesis method that is demonstrated on commodity cloud computing resources. In addition to presenting results for multiple animation phenomena (water, rigid, shells, kinematic deformers, etc.) we also propose 3D automatic dialogue replacement (3DADR) for virtual characters so that pre-recorded dialogue can include character movement, and near-field shadowing and scattering sound effects.", acknowledgement = ack-nhfb, articleno = "109", fjournal = "ACM Transactions on Graphics", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J778", } @Article{Abdulla:2019:OSM, author = "Parosh Aziz Abdulla and Mohamed Faouzi Atig and Bengt Jonsson and Magnus L{\aa}ng and Tuan Phong Ngo and Konstantinos Sagonas", title = "Optimal stateless model checking for reads-from equivalence under sequential consistency", journal = j-PACMPL, volume = "3", number = "OOPSLA", pages = "150:1--150:29", month = oct, year = "2019", DOI = "https://doi.org/10.1145/3360576", bibdate = "Fri Aug 7 19:22:30 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pacmpl.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3360576", abstract = "We present a new approach for stateless model checking (SMC) of multithreaded programs under Sequential Consistency (SC) semantics. To combat state-space explosion, SMC is often equipped with a partial-order reduction technique, which defines an \ldots{}", acknowledgement = ack-nhfb, articleno = "150", fjournal = "Proceedings of the ACM on Programming Languages", journal-URL = "https://pacmpl.acm.org/", } @Article{Amestoy:2019:PSB, author = "Patrick R. 
Amestoy and Alfredo Buttari and Jean-Yves L'Excellent and Theo Mary", title = "Performance and Scalability of the Block Low-Rank Multifrontal Factorization on Multicore Architectures", journal = j-TOMS, volume = "45", number = "1", pages = "2:1--2:26", month = mar, year = "2019", CODEN = "ACMSCU", DOI = "https://doi.org/10.1145/3242094", ISSN = "0098-3500 (print), 1557-7295 (electronic)", ISSN-L = "0098-3500", bibdate = "Mon May 6 18:23:42 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/toms.bib", URL = "https://dl.acm.org/citation.cfm?id=3242094", abstract = "Matrices coming from elliptic Partial Differential Equations have been shown to have a low-rank property that can be efficiently exploited in multifrontal solvers to provide a substantial reduction of their complexity. Among the possible low-rank formats, the Block Low-Rank format (BLR) is easy to use in a general purpose multifrontal solver and its potential compared to standard (full-rank) solvers has been demonstrated. Recently, new variants have been introduced and it was proved that they can further reduce the complexity but their performance has never been analyzed. In this article, we present a multithreaded BLR factorization and analyze its efficiency and scalability in shared-memory multicore environments. We identify the challenges posed by the use of BLR approximations in multifrontal solvers and put forward several algorithmic variants of the BLR factorization that overcome these challenges by improving its efficiency and scalability. 
We illustrate the performance analysis of the BLR multifrontal factorization with numerical experiments on a large set of problems coming from a variety of real-life applications.", acknowledgement = ack-nhfb, articleno = "2", fjournal = "ACM Transactions on Mathematical Software (TOMS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J782", } @Article{Areias:2019:MDL, author = "Miguel Areias and Ricardo Rocha", title = "Multi-dimensional lock-free arrays for multithreaded mode-directed tabling in {Prolog}", journal = j-CCPE, volume = "31", number = "5", pages = "e4491:1--e4491:??", day = "10", month = mar, year = "2019", CODEN = "CCPEBO", DOI = "https://doi.org/10.1002/cpe.4491", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Thu Mar 28 08:07:55 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ccpe.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Concurrency and Computation: Practice and Experience", journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626", onlinedate = "30 March 2018", } @Article{Asyabi:2019:COS, author = "Esmail Asyabi and Erfan Sharafzadeh and SeyedAlireza SanaeeKohroudi and Mohsen Sharifi", title = "{CTS}: an operating system {CPU} scheduler to mitigate tail latency for latency-sensitive multi-threaded applications", journal = j-J-PAR-DIST-COMP, volume = "133", number = "??", pages = "232--243", month = nov, year = "2019", CODEN = "JPDCER", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Fri Sep 13 10:25:21 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.sciencedirect.com/science/article/pii/S0743731518302387", acknowledgement = ack-nhfb, fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", } 
@Article{Bajczi:2019:WMP, author = "Levente Bajczi and Andr{\'a}s V{\"o}r{\"o}s and Vince Moln{\'a}r", title = "Will My Program Break on This Faulty Processor?: {Formal} Analysis of Hardware Fault Activations in Concurrent Embedded Software", journal = j-TECS, volume = "18", number = "5s", pages = "89:1--89:??", month = oct, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3358238", ISSN = "1539-9087 (print), 1558-3465 (electronic)", ISSN-L = "1539-9087", bibdate = "Thu Oct 17 18:16:44 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/tecs.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3358238", abstract = "Formal verification is approaching a point where it will be reliably applicable to embedded software. Even though formal verification can efficiently analyze multi-threaded applications, multi-core processors are often considered too dangerous to use in critical systems, despite the many benefits they can offer. One reason is the advanced memory consistency model of such CPUs. Nowadays, most software verifiers assume strict sequential consistency, which is also the na{\"\i}ve view of programmers. Modern multi-core processors, however, rarely guarantee this assumption by default. In addition, complex processor architectures may easily contain design faults. Thanks to the recent advances in hardware verification, these faults are increasingly visible and can be detected even in existing processors, giving an opportunity to compensate for the problem in software. In this paper, we propose a generic approach to consider inconsistent behavior of the hardware in the analysis of software. Our approach is based on formal methods and can be used to detect the activation of existing hardware faults on the application level and facilitate their mitigation in software. 
The approach relies heavily on recent results of model checking and hardware verification and offers new, integrative research directions. We propose a partial solution based on existing model checking tools to demonstrate feasibility and evaluate their performance in this context.", acknowledgement = ack-nhfb, articleno = "89", fjournal = "ACM Transactions on Embedded Computing Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J840", } @Article{Balkind:2019:OOS, author = "Jonathan Balkind and Michael McKeown and Yaosheng Fu and Tri Nguyen and Yanqi Zhou and Alexey Lavrov and Mohammad Shahrad and Adi Fuchs and Samuel Payne and Xiaohua Liang and Matthew Matl and David Wentzlaff", title = "{OpenPiton}: an open source hardware platform for your research", journal = j-CACM, volume = "62", number = "12", pages = "79--87", month = dec, year = "2019", CODEN = "CACMA2", DOI = "https://doi.org/10.1145/3366343", ISSN = "0001-0782 (print), 1557-7317 (electronic)", ISSN-L = "0001-0782", bibdate = "Mon Nov 25 09:55:53 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/cacm2010.bib; https://www.math.utah.edu/pub/tex/bib/gnu.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "https://cacm.acm.org/magazines/2019/12/241058/fulltext", abstract = "Industry is building larger, more complex, manycore processors on the back of strong institutional knowledge, but academic projects face difficulties in replicating that scale. To alleviate these difficulties and to develop and share knowledge, the community needs open architecture frameworks for simulation, chip design, and software exploration that support extensibility, scalability, and configurability, alongside an established base of verification tools and supported software. In this article, we present OpenPiton, an open source framework for building scalable architecture research prototypes from one core to 500 million cores. 
OpenPiton is the world's first open source, general-purpose, multithreaded manycore processor, and framework. OpenPiton is highly configurable, providing a rich design space spanning a variety of hardware parameters that researchers can change. OpenPiton designs can be emulated on FPGAs, where they can run full-stack multiuser Debian Linux. OpenPiton is designed to scale to very large core fabrics, enabling researchers to measure operating system, compiler, and software scalability. The mature code-base reflects the complexity of an industrial-grade design and provides the necessary scripts to build new chips, making OpenPiton a natural choice for computer-aided design (CAD) research. OpenPiton has been validated with a 25-core chip prototype, named Piton, and is bolstered by a validation suite that has thousands of tests, providing an environment to test new hardware designs while verifying the correctness of the whole system. OpenPiton is being actively used in research both internally to Princeton and in the wider community, as well as being adopted in education, industry, and government settings.", acknowledgement = ack-nhfb, fjournal = "Communications of the ACM", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J79", } @Article{Bonizzoni:2019:MMB, author = "Paola Bonizzoni and Gianluca Della Vedova and Yuri Pirola and Marco Previtali and Raffaella Rizzi", title = "Multithread Multistring {Burrows--Wheeler} Transform and Longest Common Prefix Array", journal = j-J-COMPUT-BIOL, volume = "26", number = "9", pages = "948--961", month = sep, year = "2019", CODEN = "JCOBEM", DOI = "https://doi.org/10.1089/cmb.2018.0230", ISSN = "1066-5277 (print), 1557-8666 (electronic)", ISSN-L = "1066-5277", bibdate = "Tue Oct 8 06:02:58 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jcomputbiol.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "https://www.liebertpub.com/doi/abs/10.1089/cmb.2018.0230; 
https://www.liebertpub.com/doi/pdf/10.1089/cmb.2018.0230", acknowledgement = ack-nhfb, fjournal = "Journal of Computational Biology", journal-URL = "https://www.liebertpub.com/loi/cmb/", onlinedate = "29 May 2019", } @Article{Bouksiaa:2019:UDE, author = "M. S. M. Bouksiaa and F. Trahay and A. Lescouet and G. Voron and R. Dulong and A. Guermouche and {\'E}. Brunet and G. Thomas", title = "Using Differential Execution Analysis to Identify Thread Interference", journal = j-IEEE-TRANS-PAR-DIST-SYS, volume = "30", number = "12", pages = "2866--2878", month = dec, year = "2019", CODEN = "ITDSEO", DOI = "https://doi.org/10.1109/TPDS.2019.2927481", ISSN = "1045-9219 (print), 1558-2183 (electronic)", ISSN-L = "1045-9219", bibdate = "Thu Dec 19 09:20:35 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Parallel and Distributed Systems", journal-URL = "http://www.computer.org/portal/web/csdl/transactions/tpds", keywords = "bottleneck detection; Energy storage; Generators; multithreading; Performance analysis; Power system stability; Real-time systems; Renewable energy sources; Supply and demand", } @Article{Brais:2019:AAM, author = "Hadi Brais and Preeti Ranjan Panda", title = "{Alleria}: an Advanced Memory Access Profiling Framework", journal = j-TECS, volume = "18", number = "5s", pages = "81:1--81:??", month = oct, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3358193", ISSN = "1539-9087 (print), 1558-3465 (electronic)", ISSN-L = "1539-9087", bibdate = "Thu Oct 17 18:16:44 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/tecs.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3358193", abstract = "Application analysis and simulation tools are used extensively by embedded system designers to improve existing optimization techniques or 
develop new ones. We propose the Alleria framework to make it easier for designers to comprehensively collect critical information such as virtual and physical memory addresses, accessed values, and thread schedules about one or more target applications. Such profilers often incur substantial performance overheads that are orders of magnitude larger than native execution time. We discuss how that overhead can be significantly reduced using a novel profiling mechanism called adaptive profiling. We develop a heuristic-based adaptive profiling mechanism and evaluate its performance using single-threaded and multi-threaded applications. The proposed technique can improve profiling throughput by up to 145\% and by 37\% on an average, enabling Alleria to be used to comprehensively profile applications with a throughput of over 3 million instructions per second.", acknowledgement = ack-nhfb, articleno = "81", fjournal = "ACM Transactions on Embedded Computing Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J840", } @Article{Budhkar:2019:AMD, author = "Prerna Budhkar and Ildar Absalyamov and Vasileios Zois and Skyler Windh and Walid A. Najjar and Vassilis J. Tsotras", title = "Accelerating In-Memory Database Selections Using Latency Masking Hardware Threads", journal = j-TACO, volume = "16", number = "2", pages = "13:1--13:??", month = may, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3310229", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jul 26 14:25:54 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Inexpensive DRAMs have created new opportunities for in-memory data analytics. However, the major bottleneck in such systems is high memory access latency. Traditionally, this problem is solved with large cache hierarchies that only benefit regular applications. 
Alternatively, many data-intensive applications exhibit irregular behavior. Hardware multithreading can better cope with high latency seen in such applications. This article implements a multithreaded prototype (MTP) on FPGAs for the relational selection operator that exhibits control flow irregularity. On a standard TPC-H query evaluation, MTP achieves a bandwidth utilization of 83\%, while the CPU and the GPU implementations achieve 61\% and 64\%, respectively. Besides being bandwidth efficient, MTP is also $ 14.2 \times $ and $ 4.2 \times $ more power efficient than CPU and GPU, respectively.", acknowledgement = ack-nhfb, articleno = "13", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", } @Article{Carroll:2019:ACM, author = "Shane Carroll and Wei-ming Lin", title = "Applied On-Chip Machine Learning for Dynamic Resource Control in Multithreaded Processors", journal = j-PARALLEL-PROCESS-LETT, volume = "29", number = "03", pages = "??--??", month = sep, year = "2019", DOI = "https://doi.org/10.1142/S0129626419500130", ISSN = "0129-6264 (print), 1793-642X (electronic)", ISSN-L = "0129-6264", bibdate = "Mon Mar 29 12:30:09 MDT 2021", bibsource = "http://ejournals.wspc.com.sg/ppl/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/parallelprocesslett.bib", URL = "https://www.worldscientific.com/doi/10.1142/S0129626419500130", abstract = "In this paper, we propose a machine learning algorithm to control instruction fetch bandwidth in a simultaneous multithreaded CPU. In a simultaneous multithreaded CPU, multiple threads occupy pools of hardware resources in the same clock cycle. Under some conditions, one or more threads may undergo a period of inefficiency, e.g., a cache miss, thereby inefficiently using shared resources and degrading the performance of other threads. 
If these inefficiencies can be identified at runtime, the offending thread can be temporarily blocked from fetching new instructions into the pipeline and given time to recover from its inefficiency, and prevent the shared system resources from being wasted on a stalled thread. In this paper, we propose a machine learning approach to determine when a thread should be blocked from fetching new instructions. The model is trained offline and the parameters embedded in a CPU, which can be queried with runtime statistics to determine if a thread is running inefficiently and should be temporarily blocked from fetching. We propose two models: a simple linear model and a higher-capacity neural network. We test each model in a simulation environment and show that system performance can increase by up to 19\% on average with a feasible implementation of the proposed algorithm.", acknowledgement = ack-nhfb, fjournal = "Parallel Processing Letters", journal-URL = "http://www.worldscientific.com/loi/ppl", } @Article{Carroll:2019:RRT, author = "Shane Carroll and Wei-ming Lin", title = "Round Robin Thread Selection Optimization in Multithreaded Processors", journal = j-PARALLEL-PROCESS-LETT, volume = "29", number = "01", pages = "??--??", month = mar, year = "2019", DOI = "https://doi.org/10.1142/S0129626419500038", ISSN = "0129-6264 (print), 1793-642X (electronic)", ISSN-L = "0129-6264", bibdate = "Mon Mar 29 12:30:06 MDT 2021", bibsource = "http://ejournals.wspc.com.sg/ppl/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/parallelprocesslett.bib", URL = "https://www.worldscientific.com/doi/10.1142/S0129626419500038", abstract = "We propose a variation of round-robin ordering in a multi-threaded pipeline to increase system throughput and resource distribution fairness. We show that using round robin with a typical arbitrary ordering results in inefficient use of shared resources and subsequent thread starvation.
To address this but still use a simple round-robin approach, we optimally and dynamically sort the order of the round robin periodically at runtime. We show that with 4-threaded workloads, throughput can be improved by over 9\% and harmonic throughput by over 3\% by sorting thread order at run time. We experiment with multiple stages of the pipeline and show consistent results throughout several experiments using the SPEC CPU 2006 benchmarks. Furthermore, since the technique is still a simple round robin, the increased performance requires little overhead to implement.", acknowledgement = ack-nhfb, fjournal = "Parallel Processing Letters", journal-URL = "http://www.worldscientific.com/loi/ppl", } @Article{Fraguela:2019:EDP, author = "B. B. Fraguela and D. Andrade", title = "Easy Dataflow Programming in Clusters with {UPC++} {DepSpawn}", journal = j-IEEE-TRANS-PAR-DIST-SYS, volume = "30", number = "6", pages = "1267--1282", month = jun, year = "2019", CODEN = "ITDSEO", DOI = "https://doi.org/10.1109/TPDS.2018.2884716", ISSN = "1045-9219 (print), 1558-2183 (electronic)", ISSN-L = "1045-9219", bibdate = "Fri Aug 30 06:09:58 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Parallel and Distributed Systems", journal-URL = "http://www.computer.org/portal/web/csdl/transactions/tpds", keywords = "application program interfaces; arbitrarily complex task-parallel codes; Arrays; C++ languages; data flow analysis; dataflow; dataflow approach; distributed memory; distributed memory systems; easy dataflow programming; Electronics packaging; host language; implied uncertainties; interoperability; Libraries; message passing; multi-threading; multithreading; parallel processing; parallel programming; parallel programming models; partitioned global address space programming model; PGAS libraries; PGAS UPC++ library; 
programmability; Programming; Proposals; relevant proposals; software libraries; Task analysis; traditional message-passing paradigm; UPC++ DepSpawn", } @Article{Gueunet:2019:TBA, author = "C. Gueunet and P. Fortin and J. Jomier and J. Tierny", title = "Task-Based Augmented Contour Trees with {Fibonacci} Heaps", journal = j-IEEE-TRANS-PAR-DIST-SYS, volume = "30", number = "8", pages = "1889--1905", month = aug, year = "2019", CODEN = "ITDSEO", DOI = "https://doi.org/10.1109/TPDS.2019.2898436", ISSN = "1045-9219 (print), 1558-2183 (electronic)", ISSN-L = "1045-9219", bibdate = "Fri Aug 30 06:09:58 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/fibquart.bib; https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Parallel and Distributed Systems", journal-URL = "http://www.computer.org/portal/web/csdl/transactions/tpds", keywords = "computation procedure; contour tree based applications; Data analysis; data segmentation applications; data structures; Data structures; data visualisation; Data visualization; fast shared memory; Fibonacci heaps; independent local tasks; intermediate data structures; join split trees; multi-core architecture; multi-threading; multicore computation; OpenMP task runtime; parallel algorithm; parallel algorithms; Parallel algorithms; parallel thanks; Runtime; Scientific visualization; Task analysis; task parallelism; task-based augmented contour trees; topological data analysis; tree algorithm; trees (mathematics)", } @Article{Herdt:2019:CSB, author = "Vladimir Herdt and Hoang M. 
Le and Daniel Gro{\ss}e and Rolf Drechsler", title = "Combining sequentialization-based verification of multi-threaded {C} programs with symbolic {Partial Order Reduction}", journal = j-INT-J-SOFTW-TOOLS-TECHNOL-TRANSFER, volume = "21", number = "5", pages = "545--565", month = oct, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1007/s10009-019-00507-5", ISSN = "1433-2779 (print), 1433-2787 (electronic)", ISSN-L = "1433-2779", bibdate = "Fri Oct 11 15:05:00 MDT 2019", bibsource = "http://link.springer.com/journal/10009/21/5; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sttt.bib", URL = "https://link.springer.com/article/10.1007/s10009-019-00507-5", acknowledgement = ack-nhfb, fjournal = "International Journal on Software Tools for Technology Transfer (STTT)", journal-URL = "http://link.springer.com/journal/10009", } @Article{Iliakis:2019:LIG, author = "Konstantinos Iliakis and Sotirios Xydis and Dimitrios Soudris", title = "{LOOG}: Improving {GPU} Efficiency With Light-Weight Out-Of-Order Execution", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "2", pages = "166--169", month = jul, year = "2019", DOI = "https://doi.org/10.1109/LCA.2019.2951161", ISSN = "1556-6064", ISSN-L = "1556-6056", bibdate = "Thu May 27 16:19:32 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "GPUs are one of the most prevalent platforms for accelerating general-purpose workloads due to their intuitive programming model, computing capacity, and cost-effectiveness. GPUs rely on massive multi-threading and fast context switching to overlap computations with memory operations. Among the diverse GPU workloads, there exists a class of kernels that fail to maintain a sufficient number of active warps to hide the latency of memory operations, and thus suffer from frequent stalling. 
We observe that these kernels will benefit from increased levels of Instruction-Level Parallelism and we propose a novel architecture with lightweight Out-Of-Order execution capability. To minimize hardware overheads, we carefully design our extension to highly re-use the existing micro-architectural structures. We show that the proposed architecture outperforms traditional platforms by 15 to 46 percent on average for low occupancy kernels, with an area overhead of 0.74 to 3.94 percent. Finally, we prove the potential of our proposal as a GPU u-arch alternative, by providing a 5 percent speedup over a wide collection of 63 general-purpose kernels with as little as 0.74 percent area overhead.", acknowledgement = ack-nhfb, fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Copper; GPGPU; Graphics processing units; Kernel; micro-architecture; Out of order; Out-of-Order execution; Radio access technologies; Radio frequency; Registers", } @Article{Jia:2019:UPD, author = "Z. Jia and W. Gao and Y. Shi and S. A. McKee and Z. Ji and J. Zhan and L. Wang and L. 
Zhang", title = "Understanding Processors Design Decisions for Data Analytics in Homogeneous Data Centers", journal = j-IEEE-TRANS-BIG-DATA, volume = "5", number = "1", pages = "81--94", month = mar, year = "2019", DOI = "https://doi.org/10.1109/TBDATA.2017.2758792", ISSN = "2332-7790", bibdate = "Fri Aug 2 11:24:47 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetransbigdata.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Big Data", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=6687317", keywords = "Big Data; big data; brawny multicore processors; Clocks; computational performance; computer centres; Data analysis; data analysis; Data analytics; data analytics workloads; data center systems; energy conservation; energy efficiency; energy-efficiency; homogeneous data centers; many-core processors; multi-threading; Multicore processing; multiprocessing systems; performance; performance-cost efficiency; Pipelines; power aware computing; processor design decisions; processor evaluation; Program processors; simultaneous multithreading", } @Book{Klabnik:2019:RPL, author = "Steve Klabnik and Carol Nichols", title = "The {Rust} programming language", publisher = pub-NO-STARCH, address = pub-NO-STARCH:adr, edition = "Second", pages = "xxix + 526", year = "2019", ISBN = "1-09-812253-4, 1-71850-044-0 (paperback)", ISBN-13 = "978-1-09-812253-9, 978-1-71850-044-0 (paperback)", LCCN = "QA76.73.R87", bibdate = "Fri Nov 8 05:59:02 MST 2019", bibsource = "fsz3950.oclc.org:210/WorldCat; https://www.math.utah.edu/pub/tex/bib/master.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://proquest.safaribooksonline.com/?fpi=9781098122539; https://nostarch.com/download/samples/RustProgrammingLanguage2018_Sample_ToC.pdf; https://nostarch.com/Rust2018", abstract = "\booktitle{The Rust Programming Language} is the official book on Rust: an open source 
systems programming language that helps you write faster, more reliable software. Rust offers control over low-level details (such as memory usage) in combination with high-level ergonomics, eliminating the hassle traditionally associated with low-level languages. The authors of \booktitle{The Rust Programming Language}, members of the Rust Core Team, share their knowledge and experience to show you how to take full advantage of Rust's features-from installation to creating robust and scalable programs. You'll begin with basics like creating functions, choosing data types, and binding variables and then move on to more advanced concepts, such as: * Ownership and borrowing, lifetimes, and traits * Using Rust's memory safety guarantees to build fast, safe programs; * Testing, error handling, and effective refactoring; * Generics, smart pointers, multithreading, trait objects, and advanced pattern matching; * Using Cargo, Rust's built-in package manager, to build, test, and document your code and manage dependencies; * How best to use Rust's advanced compiler with compiler-led programming techniques You'll find plenty of code examples throughout the book, as well as three chapters dedicated to building complete projects to test your learning: a number guessing game, a Rust implementation of a command line tool, and a multithreaded server. 
New to this edition: An extended section on Rust macros, an expanded chapter on modules, and appendixes on Rust development tools and editions.", acknowledgement = ack-nhfb, libnote = "Not in my library.", subject = "Rust (Computer program language); Computer programming; Computer programming.; Rust (Computer program language)", tableofcontents = "1: Getting started \\ 2: Programming a guessing game \\ 3: Common programming concepts \\ 4: Understanding ownership \\ 5: Using structs to structure related data \\ 6: Enums and pattern matching \\ 7: Managing growing projects with packages, crates, and modules \\ 8: Common collections \\ 9: Error handling \\ 10: Generic types, traits, and lifetimes \\ 11: Writing automated tests \\ 12: An I/O project: building a command line program \\ 13: Functional language features: iterators and closures \\ 14: More about Cargo and Crates.io \\ 15: Smart pointers \\ 16: Fearless concurrency \\ 17: Object-oriented programming features of Rust \\ 18: Patterns and matching \\ 19: Advanced features \\ 20: Final project: building a multithreaded web server \\ Appendix A: Keywords \\ Appendix B: Operators and Symbols \\ Appendix C: Derivable Traits \\ Appendix D: Useful Development Tools \\ Appendix E: Editions \\ Index", } @TechReport{Laguna:2019:GPD, author = "Ignacio Laguna and Paul C. 
Wood and Ranvijay Singh and Saurabh Bagchi", title = "{GPUMixer}: Performance-Driven Floating-Point Tuning for {GPU} Scientific Applications", type = "Report", institution = "Lawrence Livermore National Laboratory", address = "Livermore CA 94550, USA", year = "2019", bibdate = "Tue Aug 06 05:54:23 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pvm.bib", URL = "http://lagunaresearch.org/docs/isc-2019.pdf; https://www.hpcwire.com/2019/08/05/llnl-purdue-researchers-harness-gpu-mixed-precision-for-accuracy-performance-tradeoff/", abstract = "We present GPUMixer, a tool to perform mixed-precision floating-point tuning on scientific GPU applications. While precision tuning techniques are available, they are designed for serial programs and are accuracy-driven, i.e., they consider configurations that satisfy accuracy constraints, but these configurations may degrade performance. GPUMixer, in contrast, presents a performance-driven approach for tuning. We introduce a novel static analysis that finds Fast Imprecise Sets (FISets), sets of operations on low precision that minimize type conversions, which often yield performance speedups. To estimate the relative error introduced by GPU mixed-precision, we propose shadow computations analysis for GPUs, the first of this class for multi-threaded applications. 
GPUMixer obtains performance improvements of up to 46.4\% of the ideal speedup in comparison to only 20.7\% found by state-of-the-art methods.", acknowledgement = ack-nhfb, remark = "Best paper award at the 33rd ISC High Performance conference held June 16--20, 2019.", } @Article{Li:2019:HSG, author = "Yuxiang Li and Yinliang Zhao and Liyu Sun and Mengjuan Shen", title = "A hybrid sample generation approach in speculative multithreading", journal = j-J-SUPERCOMPUTING, volume = "75", number = "8", pages = "4193--4225", month = aug, year = "2019", CODEN = "JOSUED", DOI = "https://doi.org/10.1007/s11227-017-2118-3", ISSN = "0920-8542 (print), 1573-0484 (electronic)", ISSN-L = "0920-8542", bibdate = "Thu Oct 10 15:31:22 MDT 2019", bibsource = "http://link.springer.com/journal/11227/75/8; https://www.math.utah.edu/pub/tex/bib/jsuper.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "The Journal of Supercomputing", journal-URL = "http://link.springer.com/journal/11227", } @Article{Li:2019:SRM, author = "Y. Li and K. Nomura and J. A. Insley and V. Morozov and K. Kumaran and N. A. Romero and W. A. Goddard and R. K. Kalia and A. Nakano and P. 
Vashishta", title = "Scalable Reactive Molecular Dynamics Simulations for Computational Synthesis", journal = j-COMPUT-SCI-ENG, volume = "21", number = "5", pages = "64--75", month = sep, year = "2019", CODEN = "CSENFA", DOI = "https://doi.org/10.1109/MCSE.2018.110150043", ISSN = "1521-9615 (print), 1558-366X (electronic)", ISSN-L = "1521-9615", bibdate = "Mon Aug 19 06:40:58 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/computscieng.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Computing in Science and Engineering", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5992", keywords = "Computational modeling; Computer science; computer system implementation mathematics of computing; computing methodologies; data; general; large and medium ( mainframe ) computers; Materials science and technology; Mathematical model; modeling and prediction; Multithreading; numerical analysis; Numerical models; operating systems; parallel algorithms; performance; Predictive models; simulation theory; simulation, modeling, and visualization; software; software engineering; super (very large) computers; system applications and experience; theory of computation; types of simulation", } @Article{Li:2019:TBH, author = "Bing Li and Mengjie Mao and Xiaoxiao Liu and Tao Liu and Zihao Liu and Wujie Wen and Yiran Chen and Hai (Helen) Li", title = "Thread Batching for High-performance Energy-efficient {GPU} Memory Design", journal = j-JETC, volume = "15", number = "4", pages = "39:1--39:??", month = dec, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3330152", ISSN = "1550-4832", bibdate = "Tue Dec 17 07:50:24 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pvm.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3330152", abstract = "Massive multi-threading in GPU imposes 
tremendous pressure on memory subsystems. Due to rapid growth in thread-level parallelism of GPU and slowly improved peak memory bandwidth, memory becomes a bottleneck of GPU's performance and energy efficiency. In this article, we propose an integrated architectural scheme to optimize the memory accesses and therefore boost the performance and energy efficiency of GPU. First, we propose a thread batch enabled memory partitioning (TEMP) to improve GPU memory access parallelism. In particular, TEMP groups multiple thread blocks that share the same set of pages into a thread batch and applies a page coloring mechanism to bound each stream multiprocessor (SM) to the dedicated memory banks. After that, TEMP dispatches the thread batch to an SM to ensure high-parallel memory-access streaming from the different thread blocks. Second, a thread batch-aware scheduling (TBAS) scheme is introduced to improve the GPU memory access locality and to reduce the contention on memory controllers and interconnection networks. Experimental results show that the integration of TEMP and TBAS can achieve up to 10.3\% performance improvement and 11.3\% DRAM energy reduction across diverse GPU applications. We also evaluate the performance interference of the mixed CPU+GPU workloads when they are run on a heterogeneous system that employs our proposed schemes. Our results show that a simple solution can effectively ensure the efficient execution of both GPU and CPU applications.", acknowledgement = ack-nhfb, articleno = "39", fjournal = "ACM Journal on Emerging Technologies in Computing Systems (JETC)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967", } @Article{Mironov:2019:MPE, author = "Vladimir Mironov and Yuri Alexeev and Dmitri G. 
Fedorov", title = "Multithreaded parallelization of the energy and analytic gradient in the fragment molecular orbital method", journal = j-IJQC, volume = "119", number = "12", pages = "e25937:1--e25937:??", day = "15", month = jun, year = "2019", CODEN = "IJQCB2", DOI = "https://doi.org/10.1002/qua.25937", ISSN = "0020-7608 (print), 1097-461X (electronic)", ISSN-L = "0020-7608", bibdate = "Wed Oct 9 06:14:07 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ijqc2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "International Journal of Quantum Chemistry", journal-URL = "http://www.interscience.wiley.com/jpages/0020-7608/", onlinedate = "26 April 2019", } @Article{Noroozi:2019:BSI, author = "Ali A. Noroozi and Jaber Karimpour and Ayaz Isazadeh", title = "Bisimulation for Secure Information Flow Analysis of Multi-Threaded Programs", journal = j-MATH-COMPUT-APPL, volume = "24", number = "2", pages = "??--??", month = jun, year = "2019", CODEN = "????", DOI = "https://doi.org/10.3390/mca24020064", ISSN = "2297-8747", ISSN-L = "2297-8747", bibdate = "Sun Feb 18 06:28:36 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/math-comput-appl.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "https://www.mdpi.com/2297-8747/24/2/64", acknowledgement = ack-nhfb, ajournal = "Math. Comput. 
Appl.", articleno = "64", fjournal = "Mathematical and Computational Applications", journal-URL = "https://www.mdpi.com/journal/mca", } @Article{Oz:2019:SMA, author = "Isil Oz and Sanem Arslan", title = "A Survey on Multithreading Alternatives for Soft Error Fault Tolerance", journal = j-COMP-SURV, volume = "52", number = "2", pages = "27:1--27:??", month = may, year = "2019", CODEN = "CMSVAN", DOI = "https://doi.org/10.1145/3302255", ISSN = "0360-0300 (print), 1557-7341 (electronic)", ISSN-L = "0360-0300", bibdate = "Sat Aug 31 09:04:37 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/compsurv.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3302255", abstract = "Smaller transistor sizes and reduction in voltage levels in modern microprocessors induce higher soft error rates. This trend makes reliability a primary design constraint for computer systems. Redundant multithreading (RMT) makes use of parallelism in modern systems by employing thread-level time redundancy for fault detection and recovery. RMT can detect faults by running identical copies of the program as separate threads in parallel execution units with identical inputs and comparing their outputs. In this article, we present a survey of RMT implementations at different architectural levels with several design considerations. We explain the implementations in seminal papers and their extensions and discuss the design choices employed by the techniques. We review both hardware and software approaches by presenting the main characteristics and analyze the studies with different design choices regarding their strengths and weaknesses. 
We also present a classification to help potential users find a suitable method for their requirement and to guide researchers planning to work on this area by providing insights into the future trend.", acknowledgement = ack-nhfb, articleno = "27", fjournal = "ACM Computing Surveys", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J204", } @Article{Roth:2019:AOC, author = "{\'A}goston R{\'o}th", title = "Algorithm 992: An {OpenGL}- and {C++}-based Function Library for Curve and Surface Modeling in a Large Class of Extended {Chebyshev} Spaces", journal = j-TOMS, volume = "45", number = "1", pages = "13:1--13:32", month = mar, year = "2019", CODEN = "ACMSCU", DOI = "https://doi.org/10.1145/3284979", ISSN = "0098-3500 (print), 1557-7295 (electronic)", ISSN-L = "0098-3500", bibdate = "Mon May 6 18:23:42 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/toms.bib", URL = "https://dl.acm.org/citation.cfm?id=3284979", abstract = "We propose a platform-independent multi-threaded function library that provides data structures to generate, differentiate, and render both the ordinary basis and the normalized B-basis of a user-specified extended Chebyshev (EC) space that comprises the constants and can be identified with the solution space of a constant-coefficient homogeneous linear differential equation defined on a sufficiently small interval. Using the obtained normalized B-bases, our library can also generate, (partially) differentiate, modify, and visualize a large family of so-called B-curves and tensor product B-surfaces. 
Moreover, the library also implements methods that can be used to perform dimension elevation, to subdivide B-curves and B-surfaces by means of de Casteljau-like B-algorithms, and to generate basis transformations for the B-representation of arbitrary integral curves and surfaces that are described in traditional parametric form by means of the ordinary bases of the underlying EC spaces. Independently of the algebraic, exponential, trigonometric, or mixed type of the applied EC space, the proposed library is numerically stable and efficient up to a reasonable dimension number and may be useful for academics and engineers in the fields of Approximation Theory, Computer Aided Geometric Design, Computer Graphics, and Isogeometric and Numerical Analysis.", acknowledgement = ack-nhfb, articleno = "13", fjournal = "ACM Transactions on Mathematical Software (TOMS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J782", } @Article{Sabarimuthu:2019:ADC, author = "J. M. Sabarimuthu and T. G. Venkatesh", title = "Analytical Derivation of Concurrent Reuse Distance Profile for Multi-Threaded Application Running on Chip Multi-Processor", journal = j-IEEE-TRANS-PAR-DIST-SYS, volume = "30", number = "8", pages = "1704--1721", month = aug, year = "2019", CODEN = "ITDSEO", DOI = "https://doi.org/10.1109/TPDS.2019.2896633", ISSN = "1045-9219 (print), 1558-2183 (electronic)", ISSN-L = "1045-9219", bibdate = "Fri Aug 30 06:09:58 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Parallel and Distributed Systems", journal-URL = "http://www.computer.org/portal/web/csdl/transactions/tpds", keywords = "analytical model; analytical model based reuse distance prediction; Analytical models; cache memory design space; cache performance; cache storage; coherent reuse distance profile; compiler optimization; Complexity theory; concurrent reuse 
distance; concurrent reuse distance profile; Histograms; Instruction sets; locality analysis; Markov chain; Markov processes; Measurement; microprocessor chips; multi-core processors; multi-threaded applications; multi-threading; multicore simulator Sniper; multiprocessing systems; multithreaded application; optimisation; Performance analysis; performance analysis; probability; probability theory; Reuse distance profile; shared memory environment; simulation; standalone reuse distance profile; thread sharing", } @Book{Sengupta:2019:JHP, author = "Avik Sengupta", title = "{Julia} high performance optimizations, distributed computing, multithreading, and {GPU} programming with {Julia 1.0} and beyond", publisher = pub-PACKT, address = pub-PACKT:adr, edition = "Second", pages = "218", year = "2019", ISBN = "1-78829-230-8, 1-78829-811-X", ISBN-13 = "978-1-78829-230-6, 978-1-78829-811-7", LCCN = "????", bibdate = "Thu Apr 8 16:49:31 MDT 2021", bibsource = "fsz3950.oclc.org:210/WorldCat; https://www.math.utah.edu/pub/tex/bib/julia.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://portal.igpublish.com/iglibrary/search/PACKT0005341.html", abstract = "Julia is a high-level, high-performance dynamic programming language for numerical computing. This book will help you understand the performance characteristics of your Julia programs and achieve near-C levels of performance in Julia.", acknowledgement = ack-nhfb, subject = "Julia (Computer program language); Application software; Development; Development.; Julia (Computer program language)", tableofcontents = "Foreword \\ Contributors \\ Table of Contents \\ Preface \\ 1: Julia is Fast \\ Julia \\ fast and dynamic \\ Designed for speed \\ JIT and LLVM \\ Types, type inference, and code specialization \\ How fast can Julia be? 
\\ Summary \\ 2: Analyzing Performance \\ Timing Julia functions \\ The @time macro \\ Other time macros \\ The Julia profiler \\ Using the profiler \\ ProfileView \\ Using Juno for profiling \\ Using TimerOutputs \\ Analyzing memory allocation \\ Using the memory allocation tracker \\ Statistically accurate benchmarking \\ Using \pkg{BenchmarkTools.jl} \\ Summary \\ 3: Types, Type Inference, and Stability \\ The Julia type system \\ Using types \\ Multiple dispatch \\ Abstract types \\ Julia's type hierarchy \\ Composite and immutable types \\ Type parameters \\ Type inference \\ Type-stability \\ Definitions \\ Fixing type instability \\ The performance pitfalls \\ Identifying type stability \\ Loop variables \\ Kernel methods and function barriers \\ Types in storage locations \\ Arrays \\ Composite types \\ Parametric composite types \\ Summary \\ 4: Making Fast Function Calls \\ Using globals \\ The trouble with globals \\ Fixing performance issues with globals \\ Inlining \\ Default inlining \\ Controlling inlining \\ Disabling inlining \\ Constant propagation \\ Using macros for performance \\ The Julia compilation process \\ Using macros \\ Evaluating a polynomial \\ Horner's method \\ The Horner macro \\ Generated functions \\ Using generated functions \\ Using generated functions for performance \\ Using keyword arguments \\ Summary \\ 5: Fast Numbers \\ Numbers in Julia, their layout, and storage \\ Integers \\ Integer overflow \\ BigInt \\ The floating point \\ Floating point accuracy \\ Unsigned integers \\ Trading performance for accuracy \\ The @fastmath macro \\ The K-B-N summation \\ Subnormal numbers \\ Subnormal numbers to zero \\ Summary \\ 6: Using Arrays \\ Array internals in Julia \\ Array representation and storage \\ Column-wise storage \\ Adjoints \\ Array initialization \\ Bounds checking \\ Removing the cost of bounds checking \\ Configuring bound checks at startup \\ Allocations and in-place operations \\ Preallocating function output 
\\ sizehint! \\ Mutating functions \\ Broadcasting \\ Array views \\ SIMD parallelization (AVX2, AVX512) \\ SIMD.jl \\ Specialized array types \\ Static arrays \\ Structs of arrays \\ Yeppp! \\ Writing generic library functions with arrays \\ Summary \\ 7: Accelerating Code with the GPU \\ Technical requirements \\ Getting started with GPUs \\ CUDA and Julia \\ CuArrays \\ Monte Carlo simulation on the GPU \\ Writing your own kernels \\ Measuring GPU performance \\ Performance tips \\ Scalar iteration \\ Combining kernels \\ Processing more data \\ Deep learning on the GPU \\ ArrayFire \\ Summary \\ 8: Concurrent Programming with Tasks \\ Tasks \\ Using tasks \\ The task life cycle \\ task\_local\_storage \\ Communicating between tasks \\ Task iteration \\ High-performance I/O", } @Article{Shea:2019:HSD, author = "Colin Shea and Tinoosh Mohsenin", title = "Heterogeneous Scheduling of Deep Neural Networks for Low-power Real-time Designs", journal = j-JETC, volume = "15", number = "4", pages = "36:1--36:??", month = dec, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3358699", ISSN = "1550-4832", bibdate = "Tue Dec 17 07:50:24 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/jetc.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pvm.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3358699", abstract = "Deep neural networks have become the readiest answer to a range of application challenges including image recognition, stock analysis, natural language processing, and biomedical applications such as seizure detection. All while outperforming prior leading solutions that relied heavily on hand-engineered techniques. However, deployment of these neural networks often requires high-computational and memory-intensive solutions. 
These requirements make it challenging to deploy Deep Neural Networks (DNNs) in embedded, real-time low-power applications where classic architectures, GPUs and CPUs, still impose significant power burden. Systems-on-Chip (SoC) with Field-programmable Gate Arrays (FPGAs) can be used to improve performance and allow more fine-grain control of resources than CPUs or GPUs, but it is difficult to find the optimal balance between hardware and software to improve DNN efficiency. In the current research literature there have been few proposed solutions to address optimizing hardware and software deployments of DNNs in embedded low-power systems. To address the computation resource restriction and low-power needs for deploying these networks, we describe and implement a domain-specific metric model for optimizing task deployment on differing platforms, hardware and software. Next, we propose a DNN hardware accelerator called Scalable Low-power Accelerator for real-time deep neural Networks (SCALENet) that includes multithreaded software workers. Finally, we propose a heterogeneous aware scheduler that uses the DNN-specific metric models and the SCALENet accelerator to allocate a task to a resource based on solving a numerical cost for a series of domain objectives. To demonstrate the applicability of our contribution, we deploy nine modern deep network architectures, each containing a different number of parameters within the context of two different neural network applications: image processing and biomedical seizure detection. Utilizing the metric modeling techniques integrated into the heterogeneous aware scheduler and the SCALENet accelerator, we demonstrate the ability to meet computational requirements, adapt to multiple architectures, and lower power by providing an optimized task to resource allocation. 
Our heterogeneous aware scheduler improves power saving by decreasing power consumption by 10\% of the total system power, does not affect the accuracy of the networks, and still meets the real-time deadlines. We demonstrate the ability to achieve parity with or exceed the energy efficiency of NVIDIA GPUs when evaluated against Jetson TK1 with embedded GPU SoC and with a 4$ \times $ power savings in a power envelope of 2.0W. When compared to existing FPGA-based accelerators, SCALENet's accelerator and heterogeneous aware scheduler achieves a 4$ \times $ improvement in energy efficiency.", acknowledgement = ack-nhfb, articleno = "36", fjournal = "ACM Journal on Emerging Technologies in Computing Systems (JETC)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J967", } @Article{Shomron:2019:SSS, author = "G. Shomron and T. Horowitz and U. Weiser", title = "{SMT-SA}: Simultaneous Multithreading in Systolic Arrays", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "2", pages = "99--102", month = jul, year = "2019", DOI = "https://doi.org/10.1109/LCA.2019.2924007", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Oct 1 10:18:16 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Systolic arrays (SAs) are highly parallel pipelined structures capable of executing various tasks such as matrix multiplication and convolution. They comprise a grid of usually homogeneous processing units (PUs) that are responsible for the multiply-accumulate (MAC) operations in the case of matrix multiplication. It is not rare for a PU input to be zero-valued, in which case the PU becomes idle and the array becomes underutilized. In this paper we consider a solution to employ the underutilized PUs via simultaneous multithreading (SMT). 
We explore the design space of a SMT-SA variant and evaluate its performance, area efficiency, and energy consumption. In addition, we suggest a tiling method to reduce area overheads. Our evaluation shows that a 4-thread FP16-based SMT-SA achieves speedups of up to $ 3.6 \times $ as compared to conventional SA, with $ 1.7 \times $ area overhead and negligible energy overhead.", acknowledgement = ack-nhfb, fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "4-thread FP16-based SMT-SA; area efficiency; Convolution; Correlation; Deep learning; Energy consumption; energy consumption; homogeneous processing units; Instruction sets; matrix multiplication; multi-threading; multiply-accumulate operations; Multithreading; multithreading; parallel pipelined structures; PU input; simultaneous multithreading; SMT-SA variant; Systolic arrays; systolic arrays; Task analysis", } @Article{Silva:2019:RFG, author = "Lucas Bragan{\c{c}}a {Da Silva} and Ricardo Ferreira and Michael Canesche and Marcelo M. Menezes and Maria D. Vieira and Jeronimo Penha and Peter Jamieson and Jos{\'e} Augusto M. Nacif", title = "{READY}: a Fine-Grained Multithreading Overlay Framework for Modern {CPU--FPGA} Dataflow Applications", journal = j-TECS, volume = "18", number = "5s", pages = "56:1--56:??", month = oct, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3358187", ISSN = "1539-9087 (print), 1558-3465 (electronic)", ISSN-L = "1539-9087", bibdate = "Thu Oct 17 18:16:44 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/tecs.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3358187", abstract = "In this work, we propose a framework called REconfigurable Accelerator DeploY (READY), the first framework to support polynomial runtime mapping of dataflow applications in high-performance CPU-FPGA platforms. 
READY introduces an efficient mapping with fine-grained multithreading onto an overlay architecture that hides the latency of a global interconnection network. In addition to our overlay architecture, we show how this system helps solve some of the challenges for FPGA cloud computing adoption in high-performance computing. The framework encapsulates dataflow descriptions by using a target independent, high-level API, and a dataflow model that allows for explicit spatial and temporal parallelism. READY directly maps the dataflow kernels onto the accelerator. Our tool is flexible and extensible and provides the infrastructure to explore different accelerator designs. We validate READY on the Intel Harp platform, and our experimental results show an average 2x execution runtime improvement when compared to an 8-thread multi-core processor.", acknowledgement = ack-nhfb, articleno = "56", fjournal = "ACM Transactions on Embedded Computing Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J840", } @Article{Spoto:2019:SII, author = "Fausto Spoto and Elisa Burato and Michael D. Ernst and Pietro Ferrara and Alberto Lovato and Damiano Macedonio and Ciprian Spiridon", title = "Static Identification of Injection Attacks in {Java}", journal = j-TOPLAS, volume = "41", number = "3", pages = "18:1--18:??", month = jul, year = "2019", CODEN = "ATPSDT", DOI = "https://doi.org/10.1145/3332371", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Sat Nov 23 07:18:02 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/toplas.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3332371", abstract = "The most dangerous security-related software errors, according to the OWASP Top Ten 2017 list, affect web applications. 
They are potential injection attacks that exploit user-provided data to execute undesired operations: database access and updates ( SQL injection ); generation of malicious web pages ( cross-site scripting injection ); redirection to user-specified web pages ( redirect injection ); execution of OS commands and arbitrary scripts ( command injection ); loading of user-specified, possibly heavy or dangerous classes at run time ( reflection injection ); access to arbitrary files on the file system ( path-traversal ); and storing user-provided data into heap regions normally assumed to be shielded from the outside world ( trust boundary violation ). All these attacks exploit the same weakness: unconstrained propagation of data from sources that the user of a web application controls into sinks whose activation might trigger dangerous operations. Although web applications are written in a variety of languages, Java remains a frequent choice, in particular for banking applications, where security has tangible relevance. This article defines a unified, sound protection mechanism against such attacks, based on the identification of all possible explicit flows of tainted data in Java code. Such flows can be arbitrarily complex, passing through dynamically allocated data structures in the heap. The analysis is based on abstract interpretation and is interprocedural, flow-sensitive, and context-sensitive. Its notion of taint applies to reference (non-primitive) types dynamically allocated in the heap and is object-sensitive and field-sensitive. The analysis works by translating the program into Boolean formulas that model all possible data flows. Its implementation, within the Julia analyzer for Java and Android, found injection security vulnerabilities in the Internet banking service and in the customer relationship management of large Italian banks, as well as in a set of open-source third-party applications. 
It found the command injection, which is at the origin of the 2017 Equifax data breach, one of the worst data breaches ever. For objective, repeatable results, this article also evaluates the implementation on two open-source security benchmarks: the Juliet Suite and the OWASP Benchmark for the automatic comparison of static analyzers for cybersecurity. We compared this technique against more than 10 other static analyzers, both free and commercial. The result of these experiments is that ours is the only analysis for injection that is sound (up to well-stated limitations such as multithreading and native code) and works on industrial code, and it is also much more precise than other tools.", acknowledgement = ack-nhfb, articleno = "18", fjournal = "ACM Transactions on Programming Languages and Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J783", } @Article{Storey:2019:SDP, author = "Kyle Storey and Eric Mercer and Pavel Parizek", title = "A Sound Dynamic Partial Order Reduction Engine for {Java Pathfinder}", journal = j-SIGSOFT, volume = "44", number = "4", pages = "15--15", month = dec, year = "2019", CODEN = "SFENDP", DOI = "https://doi.org/10.1145/3364452.3364457", ISSN = "0163-5948 (print), 1943-5843 (electronic)", ISSN-L = "0163-5948", bibdate = "Wed Mar 24 14:07:40 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/sigsoft2010.bib", URL = "https://dl.acm.org/doi/10.1145/3364452.3364457", abstract = "When model checking a multi-threaded program, it is often necessary to enumerate the possible ordering of concurrent events to evaluate the behavior of the program. 
However, enumerating every possible order of events quickly leads to state-space \ldots{}", acknowledgement = ack-nhfb, fjournal = "ACM SIGSOFT Software Engineering Notes", journal-URL = "https://dl.acm.org/loi/sigsoft", } @Article{Su:2019:SSC, author = "Xing Su and Xiangke Liao and Hao Jiang and Canqun Yang and Jingling Xue", title = "{SCP}: Shared Cache Partitioning for High-Performance {GEMM}", journal = j-TACO, volume = "15", number = "4", pages = "43:1--43:??", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3274654", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:20:00 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3274654", abstract = "GEneral Matrix Multiply (GEMM) is the most fundamental computational kernel routine in the BLAS library. To achieve high performance, in-memory data must be prefetched into fast on-chip caches before they are used. Two techniques, software prefetching and data packing, have been used to effectively exploit the capability of on-chip least recent used (LRU) caches, which are popular in traditional high-performance processors used in high-end servers and supercomputers. However, the market has recently witnessed a new diversity in processor design, resulting in high-performance processors equipped with shared caches with non-LRU replacement policies. This poses a challenge to the development of high-performance GEMM in a multithreaded context. As several threads try to load data into a shared cache simultaneously, interthread cache conflicts will increase significantly. We present a Shared Cache Partitioning (SCP) method to eliminate interthread cache conflicts in the GEMM routines, by partitioning a shared cache into physically disjoint sets and assigning different sets to different threads. 
We have implemented SCP in the OpenBLAS library and evaluated it on Phytium 2000+, a 64-core AArch64 processor with private LRU L1 caches and shared pseudo-random L2 caches (per four-core cluster). Our evaluation shows that SCP has effectively reduced the conflict misses in both L1 and L2 caches in a highly optimized GEMM implementation, resulting in an improvement of its performance by 2.75\% to 6.91\%.", acknowledgement = ack-nhfb, articleno = "43", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J924", } @Article{Utterback:2019:POR, author = "Robert Utterback and Kunal Agrawal and I-Ting Angelina Lee and Milind Kulkarni", title = "Processor-Oblivious Record and Replay", journal = j-TOPC, volume = "6", number = "4", pages = "20:1--20:??", month = dec, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3365659", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Fri Dec 27 16:13:12 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/topc.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3365659", abstract = "Record-and-replay systems are useful tools for debugging non-deterministic parallel programs by first recording an execution and then replaying that execution to produce the same access pattern. Existing record-and-replay systems generally target thread-based execution models, and record the behaviors and interleavings of individual threads. Dynamic multithreaded languages and libraries, such as the Cilk family, OpenMP, TBB, and the like, do not have a notion of threads. Instead, these languages provide a processor-oblivious model of programming, where programs expose task parallelism using high-level constructs such as spawn/sync without regard to the number of threads/cores available to run the program. 
Thread-based record-and-replay would violate the processor-oblivious nature of these programs, as they incorporate the number of threads into the recorded information, constraining the replayed execution to the same number of threads. In this article, we present a processor-oblivious record-and-replay scheme for dynamic multithreaded languages where record and replay can use different number of processors and both are scheduled using work stealing. We provide theoretical guarantees for our record and replay scheme-namely that record is optimal for programs with one lock and replay is near-optimal for all cases. In addition, we implemented this scheme in the Cilk Plus runtime system and our evaluation indicates that processor-obliviousness does not cause substantial overheads.", acknowledgement = ack-nhfb, articleno = "20", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Venkataramani:2019:SMM, author = "Vanchinathan Venkataramani and Mun Choon Chan and Tulika Mitra", title = "Scratchpad-Memory Management for Multi-Threaded Applications on Many-Core Architectures", journal = j-TECS, volume = "18", number = "1", pages = "10:1--10:??", month = feb, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3301308", ISSN = "1539-9087 (print), 1558-3465 (electronic)", ISSN-L = "1539-9087", bibdate = "Thu Oct 17 18:16:42 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/tecs.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3301308", abstract = "Contemporary many-core architectures, such as Adapteva Epiphany and Sunway TaihuLight, employ per-core software-controlled Scratchpad Memory (SPM) rather than caches for better performance-per-watt and predictability. In these architectures, a core is allowed to access its own SPM as well as remote SPMs through the Network-On-Chip (NoC). 
However, the compiler/programmer is required to explicitly manage the movement of data between SPMs and off-chip memory. Utilizing SPMs for multi-threaded applications is even more challenging, as the shared variables across the threads need to be placed appropriately. Accessing variables from remote SPMs with higher access latency further complicates this problem as certain links in the NoC may be heavily contended by multiple threads. Therefore, certain variables may need to be replicated in multiple SPMs to reduce the contention delay and/or the overall access time. We present Coordinated Data Management (CDM), a compile-time framework that automatically identifies shared/private variables and places them with replication (if necessary) to suitable on-chip or off-chip memory, taking NoC contention into consideration. We develop both an exact Integer Linear Programming (ILP) formulation as well as an iterative, scalable algorithm for placing the data variables in multi-threaded applications on many-core SPMs. Experimental evaluation on the Parallella hardware platform confirms that our allocation strategy reduces the overall execution time and energy consumption by $ 1.84 \times $ and $ 1.83 \times $, respectively, when compared to the existing approaches.", acknowledgement = ack-nhfb, articleno = "10", fjournal = "ACM Transactions on Embedded Computing Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J840", } @Article{Wang:2019:MEM, author = "L. Wang and M. Jahre and A. Adileh and Z. Wang and L. 
Eeckhout", title = "Modeling Emerging Memory-Divergent {GPU} Applications", journal = j-IEEE-COMPUT-ARCHIT-LETT, volume = "18", number = "2", pages = "95--98", month = jul, year = "2019", DOI = "https://doi.org/10.1109/LCA.2019.2923618", ISSN = "1556-6056 (print), 1556-6064 (electronic)", ISSN-L = "1556-6056", bibdate = "Tue Oct 1 10:18:16 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pvm.bib", abstract = "Analytical performance models yield valuable architectural insight without incurring the excessive runtime overheads of simulation. In this work, we study contemporary GPU applications and find that the key performance-related behavior of such applications is distinct from traditional GPU applications. The key issue is that these GPU applications are memory-intensive and have poor spatial locality, which implies that the loads of different threads commonly access different cache blocks. Such memory-divergent applications quickly exhaust the number of misses the L1 cache can process concurrently, and thereby cripple the GPU's ability to use Memory-Level Parallelism (MLP) and Thread-Level Parallelism (TLP) to hide memory latencies. 
Our Memory Divergence Model (MDM) is able to accurately represent this behavior and thereby reduces average performance prediction error by $ 14 \times $ compared to the state-of-the-art GPUMech approach across our memory-divergent applications.", acknowledgement = ack-nhfb, fjournal = "IEEE Computer Architecture Letters", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208", keywords = "Analytical models; analytical performance models; Analytical performance prediction; average performance prediction error; cache blocks; cache storage; Computational modeling; contemporary GPU applications; GPU; graphics processing units; Graphics processing units; Instruction sets; key performance-related behavior; L1 cache; Mathematical model; memory architecture; memory divergence model; memory latencies; memory-divergent applications; memory-divergent GPU applications; memory-intensive; memory-level parallelism; multi-threading; multiprocessing systems; Predictive models; Random access memory; thread-level parallelism; traditional GPU applications; valuable architectural insight", } @Article{Wang:2019:SSS, author = "Wenlu Wang and Ji Zhang and Min-Te Sun and Wei-Shinn Ku", title = "A scalable spatial skyline evaluation system utilizing parallel independent region groups", journal = j-VLDB-J, volume = "28", number = "1", pages = "73--98", month = feb, year = "2019", CODEN = "VLDBFR", DOI = "https://doi.org/10.1007/s00778-018-0519-4", ISSN = "1066-8888 (print), 0949-877X (electronic)", ISSN-L = "1066-8888", bibdate = "Tue Feb 5 08:07:20 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/vldbj.bib", abstract = "This research presents two parallel solutions to efficiently address spatial skyline queries. First, we propose a novel concept called independent regions for parallelizing the process of spatial skyline evaluation. 
Spatial skyline candidates in an independent region do not depend on any data point in other independent regions. Then, we propose a GPU-based solution. We use multi-level independent region group-based parallel filter to support efficient multi-threading spatial skyline non-candidate elimination. Beyond that, we propose comparable region to accelerate non-candidate elimination in each independent region. Secondly, we propose a MapReduce-based solution. We generate the convex hull of query points in the first MapReduce phase. In the second phase, we calculate independent regions based on the input data points and the convex hull of the query points. With the independent regions, spatial skylines are evaluated in parallel in the third phase, in which data points are partitioned by their associated independent regions in map functions, and spatial skyline candidates are calculated by reduce functions. The results of the spatial skyline queries are the union of outputs from the reduce functions. Our experimental results show that GPU multi-threading scheme is very efficient on small-scale input datasets. 
On the contrary, MapReduce scheme performs very well on large-scale input datasets.", acknowledgement = ack-nhfb, fjournal = "VLDB Journal: Very Large Data Bases", journal-URL = "http://portal.acm.org/toc.cfm?id=J869", } @Article{Watt:2019:WW, author = "Conrad Watt and Andreas Rossberg and Jean Pichon-Pharabod", title = "Weakening {WebAssembly}", journal = j-PACMPL, volume = "3", number = "OOPSLA", pages = "133:1--133:28", month = oct, year = "2019", DOI = "https://doi.org/10.1145/3360559", bibdate = "Fri Aug 7 19:22:30 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pacmpl.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3360559", abstract = "WebAssembly (Wasm) is a safe, portable virtual instruction set that can be hosted in a wide range of environments, such as a Web browser. It is a low-level language whose instructions are intended to compile directly to bare hardware. While the initial version of Wasm focussed on single-threaded computation, a recent proposal extends it with low-level support for multiple threads and atomic instructions for synchronised access to shared memory. To support the correct compilation of concurrent programs, it is necessary to give a suitable specification of its memory model.\par Wasm's language definition is based on a fully formalised specification that carefully avoids undefined behaviour. We present a substantial extension to this semantics, incorporating a relaxed memory model, along with a few proposed extensions. Wasm's memory model is unique in that its linear address space can be dynamically grown during execution, while all accesses are bounds-checked. This leads to the novel problem of specifying how observations about the size of the memory can propagate between threads. 
We argue that, considering desirable compilation schemes, we cannot give a sequentially consistent semantics to memory growth.\par We show that our model provides sequential consistency for data-race-free executions (SC-DRF). However, because Wasm is to run on the Web, we must also consider interoperability of its model with that of JavaScript. We show, by counter-example, that JavaScript's memory model is not SC-DRF, in contrast to what is claimed in its specification. We propose two axiomatic conditions that should be added to the JavaScript model to correct this difference.\par We also describe a prototype SMT-based litmus tool which acts as an oracle for our axiomatic model, visualising its behaviours, including memory resizing.", acknowledgement = ack-nhfb, articleno = "133", fjournal = "Proceedings of the ACM on Programming Languages", journal-URL = "https://pacmpl.acm.org/", } @Article{Wu:2019:HUI, author = "Jimmy Ming-Tai Wu and Jerry Chun-Wei Lin and Ashish Tamrakar", title = "High-Utility Itemset Mining with Effective Pruning Strategies", journal = j-TKDD, volume = "13", number = "6", pages = "58:1--58:??", month = dec, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3363571", ISSN = "1556-4681 (print), 1556-472X (electronic)", ISSN-L = "1556-4681", bibdate = "Wed Dec 18 14:31:03 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/tkdd.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3363571", abstract = "High-utility itemset mining is a popular data mining problem that considers utility factors, such as quantity and unit profit of items besides frequency measure from the transactional database. It helps to find the most valuable and profitable products/items that are difficult to track by using only the frequent itemsets. An item might have a high-profit value which is rare in the transactional database and has a tremendous importance. 
While there are many existing algorithms to find high-utility itemsets (HUIs) that generate comparatively large candidate sets, our main focus is on significantly reducing the computation time with the introduction of new pruning strategies. The designed pruning strategies help to reduce the visitation of unnecessary nodes in the search space, which reduces the time required by the algorithm. In this article, two new stricter upper bounds are designed to reduce the computation time by refraining from visiting unnecessary nodes of an itemset. Thus, the search space of the potential HUIs can be greatly reduced, and the mining procedure of the execution time can be improved. The proposed strategies can also significantly minimize the transaction database generated on each node. Experimental results showed that the designed algorithm with two pruning strategies outperform the state-of-the-art algorithms for mining the required HUIs in terms of runtime and number of revised candidates. The memory usage of the designed algorithm also outperforms the state-of-the-art approach. Moreover, a multi-thread concept is also discussed to further handle the problem of big datasets.", acknowledgement = ack-nhfb, articleno = "58", fjournal = "ACM Transactions on Knowledge Discovery from Data (TKDD)", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J1054", } @Article{Yeh:2019:PGR, author = "Tsung Tai Yeh and Amit Sabne and Putt Sakdhnagool and Rudolf Eigenmann and Timothy G. 
Rogers", title = "{Pagoda}: a {GPU} Runtime System for Narrow Tasks", journal = j-TOPC, volume = "6", number = "4", pages = "21:1--21:??", month = nov, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3365657", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Wed Nov 20 07:59:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/topc.bib", abstract = "Massively multithreaded GPUs achieve high throughput by running thousands of threads in parallel. To fully utilize their hardware, contemporary workloads spawn work to the GPU in bulk by launching large tasks, where each task is a kernel that contains thousands of threads that occupy the entire GPU. GPUs face severe underutilization and their performance benefits vanish if the tasks are narrow, i.e., they contain less than 512 threads. Latency-sensitive applications in network, signal, and image processing that generate a large number of tasks with relatively small inputs are examples of such limited parallelism. This article presents Pagoda, a runtime system that virtualizes GPU resources, using an OS-like daemon kernel called MasterKernel. Tasks are spawned from the CPU onto Pagoda as they become available, and are scheduled by the MasterKernel at the warp granularity. This level of control enables the GPU to keep scheduling and executing tasks as long as free warps are found, dramatically reducing underutilization. 
Experimental results on real hardware demonstrate that Pagoda achieves a geometric mean speedup of 5.52X over PThreads running on a 20-core CPU, 1.76X over CUDA-HyperQ, and 1.44X over GeMTC, the state-of-the-art runtime GPU task scheduling system.", acknowledgement = ack-nhfb, articleno = "21", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "http://dl.acm.org/citation.cfm?id=2632163", } @Article{Zhong:2019:SHS, author = "Guanwen Zhong and Akshat Dubey and Cheng Tan and Tulika Mitra", title = "{Synergy}: an {HW\slash SW} Framework for High Throughput {CNNs} on Embedded Heterogeneous {SoC}", journal = j-TECS, volume = "18", number = "2", pages = "13:1--13:??", month = apr, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3301278", ISSN = "1539-9087 (print), 1558-3465 (electronic)", ISSN-L = "1539-9087", bibdate = "Thu Oct 17 18:16:43 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/tecs.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3301278", abstract = "Convolutional Neural Networks (CNN) have been widely deployed in diverse application domains. There has been significant progress in accelerating both their training and inference using high-performance GPUs, FPGAs, and custom ASICs for datacenter-scale environments. The recent proliferation of mobile and Internet of Things (IoT) devices have necessitated real-time, energy-efficient deep neural network inference on embedded-class, resource-constrained platforms. In this context, we present Synergy, an automated, hardware-software co-designed, pipelined, high-throughput CNN inference framework on embedded heterogeneous system-on-chip (SoC) architectures (Xilinx Zynq). Synergy leverages, through multi-threading, all the available on-chip resources, which includes the dual-core ARM processor along with the FPGA and the NEON Single-Instruction Multiple-Data (SIMD) engines as accelerators. 
Moreover, Synergy provides a unified abstraction of the heterogeneous accelerators (FPGA and NEON) and can adapt to different network configurations at runtime without changing the underlying hardware accelerator architecture by balancing workload across accelerators through work-stealing. Synergy achieves 7.3X speedup, averaged across seven CNN models, over a well-optimized software-only solution. Synergy demonstrates substantially better throughput and energy-efficiency compared to the contemporary CNN implementations on the same SoC architecture.", acknowledgement = ack-nhfb, articleno = "13", fjournal = "ACM Transactions on Embedded Computing Systems", journal-URL = "http://portal.acm.org/browse_dl.cfm?idx=J840", } @Article{Zois:2019:EMM, author = "Vasileios Zois and Vassilis J. Tsotras and Walid A. Najjar", title = "Efficient main-memory top-$k$ selection for multicore architectures", journal = j-PROC-VLDB-ENDOWMENT, volume = "13", number = "2", pages = "114--127", month = oct, year = "2019", CODEN = "????", DOI = "https://doi.org/10.14778/3364324.3364327", ISSN = "2150-8097", bibdate = "Wed Dec 11 07:51:12 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", abstract = "Efficient Top-$k$ query evaluation relies on practices that utilize auxiliary data structures to enable early termination. Such techniques were designed to trade-off complex work in the buffer pool against costly access to disk-resident data. Parallel in-memory Top-$k$ selection with support for early termination presents a novel challenge because computation shifts higher up in the memory hierarchy. In this environment, data scan methods using SIMD instructions and multithreading perform well despite requiring evaluation of the complete dataset. 
Early termination schemes that favor simplicity require random access to resolve score ambiguity while those optimized for sequential access incur too many object evaluations. In this work, we introduce the concept of rank uncertainty, a measure of work efficiency that enables classifying existing solutions according to their potential for efficient parallel in-memory Top-$k$ selection. We identify data reordering and layering strategies as those having the highest potential and provide practical guidelines on how to adapt them for parallel in-memory execution (creating the VTA and SLA approaches). In addition, we show that the number of object evaluations can be further decreased by combining data reordering with angle space partitioning (introducing PTA). Our extensive experimental evaluation on varying query parameters using both synthetic and real data, showcase that PTA exhibits between 2 and 4 orders of magnitude better query latency, and throughput when compared to prior work and our optimized algorithmic variants (i.e. 
VTA, SLA).", acknowledgement = ack-nhfb, fjournal = "Proceedings of the VLDB Endowment", journal-URL = "http://portal.acm.org/citation.cfm?id=J1174", } @Article{Algosaibi:2020:PBT, author = "Abdulelah Algosaibi and Khaled Ragab and Saleh Albahli", title = "Parallel-Based Techniques for Managing and Analyzing the Performance on Semantic Graph", journal = j-PARALLEL-PROCESS-LETT, volume = "30", number = "02", pages = "??--??", month = jun, year = "2020", DOI = "https://doi.org/10.1142/S0129626420500073", ISSN = "0129-6264 (print), 1793-642X (electronic)", ISSN-L = "0129-6264", bibdate = "Mon Mar 29 12:30:13 MDT 2021", bibsource = "http://ejournals.wspc.com.sg/ppl/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/parallelprocesslett.bib", URL = "https://www.worldscientific.com/doi/10.1142/S0129626420500073", abstract = "In recent years, data are generated rapidly that advanced the evolving of the linked data. Modern data are globally distributed over the semantically linked graphs. The nature of the distributed data over the semantic graph raised new demands on further investigation on improving performance on the semantic graphs. In this work, we analyzed the time latency as an important factor to be further investigated and improved. We evaluated the parallel computing on these distributed data in order to better utilize the parallelism approaches. A federation framework based on a multi-threaded environment supporting federated SPARQL query was introduced. In our experiments, we show the achievability and effectiveness of our model on a set of real-world queries through real-world Linked Open Data cloud. Significant performance improvement has been noticed. Further, we highlight short-comings that could open an avenue in the research of federated queries. 
Keywords: Semantic web; distributed query processing; query federation; linked data; join methods.", acknowledgement = ack-nhfb, fjournal = "Parallel Processing Letters", journal-URL = "http://www.worldscientific.com/loi/ppl", } @Article{Bagherzadeh:2020:ACB, author = "Mehdi Bagherzadeh and Nicholas Fireman and Anas Shawesh and Raffi Khatchadourian", title = "Actor concurrency bugs: a comprehensive study on symptoms, root causes, {API} usages, and differences", journal = j-PACMPL, volume = "4", number = "OOPSLA", pages = "214:1--214:32", month = nov, year = "2020", DOI = "https://doi.org/10.1145/3428282", bibdate = "Tue Mar 30 08:10:50 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pacmpl.bib", URL = "https://dl.acm.org/doi/10.1145/3428282", abstract = "Actor concurrency is becoming increasingly important in the development of real-world software systems. Although actor concurrency may be less susceptible to some multithreaded concurrency bugs, such as low-level data races and deadlocks, it comes with \ldots{}", acknowledgement = ack-nhfb, articleno = "214", fjournal = "Proceedings of the ACM on Programming Languages", journal-URL = "https://pacmpl.acm.org/", } @InProceedings{Barros:2020:ALS, author = "D. A. Barros and C. Bentes", booktitle = "{2020 IEEE 32nd International Symposium on Computer Architecture and High Performance Computing (SBAC-PAD)}", title = "Analyzing the Loop Scheduling Mechanisms on {Julia} Multithreading", publisher = pub-IEEE, address = pub-IEEE:adr, pages = "257--264", year = "2020", DOI = "https://doi.org/10.1109/SBAC-PAD49847.2020.00043", bibdate = "Thu Apr 8 07:17:08 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/julia.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Julia programming language", } @Article{Castello:2020:ATL, author = "A. Castell{\'o} and R. M. Gual and S. Seo and P. Balaji and E. S. 
Quintana-Ort{\'\i} and A. J. Pe{\~n}a", title = "Analysis of Threading Libraries for High Performance Computing", journal = j-IEEE-TRANS-COMPUT, volume = "69", number = "9", pages = "1279--1292", month = sep, year = "2020", CODEN = "ITCOB4", ISSN = "0018-9340 (print), 1557-9956 (electronic)", ISSN-L = "0018-9340", bibdate = "Wed Aug 12 14:58:16 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2020.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Computers", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12", } @Article{Criswell:2020:SPC, author = "K. Criswell and T. Adegbija", title = "A Survey of Phase Classification Techniques for Characterizing Variable Application Behavior", journal = j-IEEE-TRANS-PAR-DIST-SYS, volume = "31", number = "1", pages = "224--236", month = jan, year = "2020", CODEN = "ITDSEO", DOI = "https://doi.org/10.1109/TPDS.2019.2929781", ISSN = "1045-9219 (print), 1558-2183 (electronic)", ISSN-L = "1045-9219", bibdate = "Thu Dec 19 09:20:35 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Parallel and Distributed Systems", journal-URL = "http://www.computer.org/portal/web/csdl/transactions/tpds", keywords = "adaptable computing; Big Data; big data; Clocks; Computational modeling; dynamic optimization; edge computing; emerging applications; Hardware; Multicore processing; multithreaded applications; Optimization; Phase classification; Runtime; variable program behavior; workload characterization", } @Article{Cugu:2020:PMS, author = "Ilke {\c{C}}ugu and Murat Manguoglu", title = "A parallel multithreaded sparse triangular linear system solver", journal = j-COMPUT-MATH-APPL, volume = "80", number = "2", pages = "371--385", month = jul, year = "2020", CODEN = "CMAPDK", DOI = 
"https://doi.org/10.1016/j.camwa.2019.09.012", ISSN = "0898-1221 (print), 1873-7668 (electronic)", ISSN-L = "0898-1221", bibdate = "Wed Jul 8 08:11:16 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/computmathappl2020.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.sciencedirect.com/science/article/pii/S0898122119304602", acknowledgement = ack-nhfb, fjournal = "Computers and Mathematics with Applications", journal-URL = "http://www.sciencedirect.com/science/journal/08981221", } @Article{Dosanjh:2020:TQM, author = "Matthew G. F. Dosanjh and Ryan E. Grant and Whit Schonbein and Patrick G. Bridges", title = "Tail queues: a multi-threaded matching architecture", journal = j-CCPE, volume = "32", number = "3", pages = "e5158:1--e5158:??", day = "10", month = feb, year = "2020", CODEN = "CCPEBO", DOI = "https://doi.org/10.1002/cpe.5158", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Wed Mar 31 07:52:13 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ccpe.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, ajournal = "Concurr. Comput.", fjournal = "Concurrency and Computation: Practice and Experience", journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626", onlinedate = "06 February 2019", } @Article{Feliu:2020:TII, author = "J. Feliu and J. Sahuquillo and S. Petit and L. 
Eeckhout", title = "Thread Isolation to Improve Symbiotic Scheduling on {SMT} Multicore Processors", journal = j-IEEE-TRANS-PAR-DIST-SYS, volume = "31", number = "2", pages = "359--373", month = feb, year = "2020", CODEN = "ITDSEO", DOI = "https://doi.org/10.1109/TPDS.2019.2934955", ISSN = "1045-9219 (print), 1558-2183 (electronic)", ISSN-L = "1045-9219", bibdate = "Wed Jan 22 06:09:50 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Parallel and Distributed Systems", journal-URL = "http://www.computer.org/portal/web/csdl/transactions/tpds", keywords = "Degradation; Message systems; Program processors; Resource management; Schedules; Simultaneous multithreading (SMT); Symbiosis; symbiotic job scheduling; thread isolation; Throughput", } @Article{Fezzardi:2020:ABD, author = "Pietro Fezzardi and Fabrizio Ferrandi", title = "Automated Bug Detection for High-level Synthesis of Multi-threaded Irregular Applications", journal = j-TOPC, volume = "7", number = "4", pages = "27:1--27:26", month = dec, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3418086", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Sun Mar 28 08:05:40 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/topc.bib", URL = "https://dl.acm.org/doi/10.1145/3418086", abstract = "Field Programmable Gate Arrays (FPGAs) are becoming an appealing technology in datacenters and High Performance Computing. High-Level Synthesis (HLS) of multi-threaded parallel programs is increasingly used to extract parallelism. 
Despite great leaps \ldots{}", acknowledgement = ack-nhfb, articleno = "27", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "https://dl.acm.org/loi/topc", } @Article{Ghorbani:2020:RDT, author = "Mehrdad Ghorbani and Seyed Morteza Babamir", title = "Runtime deadlock tracking and prevention of concurrent multithreaded programs: a learning-based approach", journal = j-CCPE, volume = "32", number = "10", pages = "e5324:1--e5324:??", day = "25", month = may, year = "2020", CODEN = "CCPEBO", DOI = "https://doi.org/10.1002/cpe.5324", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Wed Mar 31 07:52:16 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ccpe.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, ajournal = "Concurr. Comput.", fjournal = "Concurrency and Computation: Practice and Experience", journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626", onlinedate = "09 May 2019", } @Article{Hickey:2020:HC, author = "Rich Hickey", title = "A history of {Clojure}", journal = j-PACMPL, volume = "4", number = "HOPL", pages = "71:1--71:46", month = jun, year = "2020", DOI = "https://doi.org/10.1145/3386321", bibdate = "Fri Aug 7 17:39:13 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2020.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pacmpl.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3386321", abstract = "Clojure was designed to be a general-purpose, practical functional language, suitable for use by professionals wherever its host language, e.g., Java, would be. Initially designed in 2005 and released in 2007, Clojure is a dialect of Lisp, but is not a direct descendant of any prior Lisp. 
It complements programming with pure functions of immutable data with concurrency-safe state management constructs that support writing correct multithreaded programs without the complexity of mutex locks.\par Clojure is intentionally hosted, in that it compiles to and runs on the runtime of another language, such as the JVM. This is more than an implementation strategy; numerous features ensure that programs written in Clojure can leverage and interoperate with the libraries of the host language directly and efficiently.\par In spite of combining two (at the time) rather unpopular ideas, functional programming and Lisp, Clojure has since seen adoption in industries as diverse as finance, climate science, retail, databases, analytics, publishing, healthcare, advertising and genomics, and by consultancies and startups worldwide, much to the career-altering surprise of its author.\par Most of the ideas in Clojure were not novel, but their combination puts Clojure in a unique spot in language design (functional, hosted, Lisp). This paper recounts the motivation behind the initial development of Clojure and the rationale for various design decisions and language constructs. 
It then covers its evolution subsequent to release and adoption.", acknowledgement = ack-nhfb, articleno = "71", fjournal = "Proceedings of the ACM on Programming Languages", journal-URL = "https://pacmpl.acm.org/", } @Article{Im:2020:DWF, author = "Sungjin Im and Benjamin Moseley and Kamesh Munagala and Kirk Pruhs", title = "Dynamic Weighted Fairness with Minimal Disruptions", journal = j-POMACS, volume = "4", number = "1", pages = "19:1--19:18", month = may, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3379485", ISSN = "2476-1249", ISSN-L = "2476-1249", bibdate = "Mon Mar 29 10:31:33 MDT 2021", bibsource = "http://portal.acm.org/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pomacs.bib", URL = "https://dl.acm.org/doi/10.1145/3379485", abstract = "In this paper, we consider the following dynamic fair allocation problem: Given a sequence of job arrivals and departures, the goal is to maintain an approximately fair allocation of the resource against a target fair allocation policy, while minimizing the total number of disruptions, which is the number of times the allocation of any job is changed. We consider a rich class of fair allocation policies that significantly generalize those considered in previous work. We first consider the models where jobs only arrive, or jobs only depart. We present tight upper and lower bounds for the number of disruptions required to maintain a constant approximate fair allocation every time step. In particular, for the canonical case where jobs have weights and the resource allocation is proportional to the job's weight, we show that maintaining a constant approximate fair allocation requires $ \Theta (\log^* n) $ disruptions per job, almost matching the bounds in prior work for the unit weight case.
For the more general setting where the allocation policy only decreases the allocation to a job when new jobs arrive, we show that maintaining a constant approximate fair allocation requires $ \Theta (\log n) $ disruptions per job. We then consider the model where jobs can both arrive and depart. We first show strong lower bounds on the number of disruptions required to maintain constant approximate fairness for arbitrary instances. In contrast we then show that there is an algorithm that can maintain constant approximate fairness with $ O(1) $ expected disruptions per job if the weights of the jobs are independent of the jobs' arrival and departure order. We finally show how our results can be extended to the setting with multiple resources.", acknowledgement = ack-nhfb, articleno = "19", fjournal = "Proceedings of the ACM on Measurement and Analysis of Computing Systems (POMACS)", journal-URL = "https://dl.acm.org/loi/pomacs", } @Article{Langr:2020:RII, author = "Daniel Langr and Marin Ko{\v{c}}i{\v{c}}ka", title = "Reducing the Impact of Intensive Dynamic Memory Allocations in Parallel Multi-Threaded Programs", journal = j-IEEE-TRANS-PAR-DIST-SYS, volume = "31", number = "5", pages = "1152--1164", month = may, year = "2020", CODEN = "ITDSEO", DOI = "https://doi.org/10.1109/TPDS.2019.2960514", ISSN = "1045-9219 (print), 1558-2183 (electronic)", ISSN-L = "1045-9219", bibdate = "Thu Feb 20 10:08:58 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Parallel and Distributed Systems", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=71", keywords = "Dynamic memory allocation; memory pooling; multi-threading; parallel program; scalable heap implementation; shared memory; small buffer optimization", } @Article{Li:2020:MMT, author = "Tao Li and Xiankai Zhang and Feng Luo and Fang-Xiang Wu and
Jianxin Wang", title = "{MultiMotifMaker}: a Multi-Thread Tool for Identifying {DNA} Methylation Motifs from {Pacbio} Reads", journal = j-TCBB, volume = "17", number = "1", pages = "220--225", month = jan, year = "2020", CODEN = "ITCBCY", DOI = "https://doi.org/10.1109/TCBB.2018.2861399", ISSN = "1545-5963 (print), 1557-9964 (electronic)", ISSN-L = "1545-5963", bibdate = "Wed Jun 10 07:29:48 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/tcbb.bib", URL = "https://dl.acm.org/doi/abs/10.1109/TCBB.2018.2861399", abstract = "The methylation of DNA is an important mechanism to control biological processes. Recently, the Pacbio SMRT technology provides a new way to identify base methylation in the genome. MotifMaker is a tool developed by Pacbio for discovering DNA methylation \ldots{}", acknowledgement = ack-nhfb, fjournal = "IEEE/ACM Transactions on Computational Biology and Bioinformatics", journal-URL = "https://dl.acm.org/loi/tcbb", } @Article{Puche:2020:ECF, author = "Jos{\'e} Puche and Salvador Petit and Mar{\'\i}a E. 
G{\'o}mez and Julio Sahuquillo", title = "An efficient cache flat storage organization for multithreaded workloads for low power processors", journal = j-FUT-GEN-COMP-SYS, volume = "110", number = "??", pages = "1037--1054", month = sep, year = "2020", CODEN = "FGSEVI", DOI = "https://doi.org/10.1016/j.future.2019.11.024", ISSN = "0167-739X (print), 1872-7115 (electronic)", ISSN-L = "0167-739X", bibdate = "Fri Jun 19 07:44:19 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/futgencompsys.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.sciencedirect.com/science/article/pii/S0167739X1930384X", acknowledgement = ack-nhfb, fjournal = "Future Generation Computer Systems", journal-URL = "http://www.sciencedirect.com/science/journal/0167739X", } @Article{Tino:2020:SXE, author = "Anita Tino and Caroline Collange and Andr{\'e} Seznec", title = "{SIMT-X}: Extending Single-Instruction Multi-Threading to Out-of-Order Cores", journal = j-TACO, volume = "17", number = "2", pages = "15:1--15:23", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3392032", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jun 27 12:06:50 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3392032", abstract = "This work introduces Single Instruction Multi-Thread Express (SIMT-X), a general-purpose Central Processing Unit (CPU) microarchitecture that enables Graphics Processing Units (GPUs)-style SIMT execution across multiple threads of the same program for high throughput, while retaining the latency benefits of out-of-order execution, and the programming convenience of homogeneous multi-thread processors. SIMT-X leverages the existing Single Instruction Multiple Data (SIMD) back-end to provide CPU/GPU-like processing on a single core with minimal overhead. 
We demonstrate that although SIMT-X invokes a restricted form of Out-of-Order (OoO), the microarchitecture successfully captures a majority of the benefits of aggressive OoO execution using at most two concurrent register mappings per architectural register, while addressing issues of partial dependencies and supporting a general-purpose Instruction Set Architecture (ISA).", acknowledgement = ack-nhfb, articleno = "15", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wenjie:2020:APW, author = "Tang Wenjie and Yao Yiping and Li Tianlin and Song Xiao and Zhu Feng", title = "An Adaptive Persistence and Work-stealing Combined Algorithm for Load Balancing on Parallel Discrete Event Simulation", journal = j-TOMACS, volume = "30", number = "2", pages = "12:1--12:26", month = apr, year = "2020", CODEN = "ATMCEZ", DOI = "https://doi.org/10.1145/3364218", ISSN = "1049-3301 (print), 1558-1195 (electronic)", ISSN-L = "1049-3301", bibdate = "Tue Apr 21 08:08:16 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/tomacs.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3364218", abstract = "Load imbalance has always been a crucial challenge in Parallel Discrete Event Simulation (PDES). In the past few years, we have witnessed an increased interest in using multithreading PDES on multi/many-core platforms. In multithreading PDES, migrating \ldots{}", acknowledgement = ack-nhfb, articleno = "12", fjournal = "ACM Transactions on Modeling and Computer Simulation", journal-URL = "https://dl.acm.org/loi/tomacs", } @Misc{Yee:2020:CMT, author = "Alexander J. 
Yee", title = "{{\tt y-cruncher}}: a multi-threaded pi-program", howpublished = "Web site", day = "30", month = mar, year = "2020", bibdate = "Tue Apr 21 16:09:31 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pi.bib", URL = "http://www.numberworld.org/y-cruncher/", abstract = "How fast can your computer compute Pi?\par y-cruncher is a program that can compute Pi and other constants to trillions of digits.\par It is the first of its kind that is multi-threaded and scalable to multi-core systems. Ever since its launch in 2009, it has become a common benchmarking and stress-testing application for overclockers and hardware enthusiasts.\par y-cruncher has been used to set several world records for the most digits of Pi ever computed:\par 50 trillion digits - January 2020 (Timothy Mullican)\par 31.4 trillion digits - January 2019 (Emma Haruka Iwao)\par 22.4 trillion digits - November 2016 (Peter Trueb)\par 13.3 trillion digits - October 2014 (Sandon Van Ness ``houkouonchi'')\par 12.1 trillion digits - December 2013 (Shigeru Kondo)\par 10 trillion digits - October 2011 (Shigeru Kondo)\par 5 trillion digits - August 2010 (Shigeru Kondo)", acknowledgement = ack-nhfb, } @Article{Yin:2020:SCA, author = "L. Yin and W. Dong and W. Liu and J. 
Wang", title = "On Scheduling Constraint Abstraction for Multi-Threaded Program Verification", journal = j-IEEE-TRANS-SOFTW-ENG, volume = "46", number = "5", pages = "549--565", year = "2020", CODEN = "IESEDJ", ISSN = "0098-5589 (print), 1939-3520 (electronic)", ISSN-L = "0098-5589", bibdate = "Thu Sep 17 07:36:32 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranssoftweng2020.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Software Engineering", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=32", } @Article{Akbari:2021:EMT, author = "Amir Akbari and Dennis Giannacopoulos", title = "An efficient multi-threaded {Newton--Raphson} algorithm for strong coupling modeling of multi-physics problems", journal = j-COMP-PHYS-COMM, volume = "258", number = "??", pages = "Article 107563", month = jan, year = "2021", CODEN = "CPHCBZ", DOI = "https://doi.org/10.1016/j.cpc.2020.107563", ISSN = "0010-4655 (print), 1879-2944 (electronic)", ISSN-L = "0010-4655", bibdate = "Sat Mar 13 08:21:40 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/compphyscomm2020.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.sciencedirect.com/science/article/pii/S0010465520302708", acknowledgement = ack-nhfb, fjournal = "Computer Physics Communications", journal-URL = "http://www.sciencedirect.com/science/journal/00104655", } @Article{Arman:2021:OHP, author = "Arif Arman and Dmitri Loguinov", title = "{Origami}: a high-performance mergesort framework", journal = j-PROC-VLDB-ENDOWMENT, volume = "15", number = "2", pages = "259--271", month = oct, year = "2021", CODEN = "????", DOI = "https://doi.org/10.14778/3489496.3489507", ISSN = "2150-8097", bibdate = "Sat Feb 5 06:26:54 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/vldbe.bib", URL = 
"https://dl.acm.org/doi/10.14778/3489496.3489507", abstract = "Mergesort is a popular algorithm for sorting real-world workloads as it is immune to data skewness, suitable for parallelization using vectorized intrinsics, and relatively simple to multi-thread. In this paper, we introduce Origami, an in-memory merge-. \ldots{}", acknowledgement = ack-nhfb, fjournal = "Proceedings of the VLDB Endowment", journal-URL = "https://dl.acm.org/loi/pvldb", } @Article{Arslan:2021:ESR, author = "Sanem Arslan and Osman Unsal", title = "Efficient selective replication of critical code regions for {SDC} mitigation leveraging redundant multithreading", journal = j-J-SUPERCOMPUTING, volume = "77", number = "12", pages = "14130--14160", month = dec, year = "2021", CODEN = "JOSUED", DOI = "https://doi.org/10.1007/s11227-021-03804-6", ISSN = "0920-8542 (print), 1573-0484 (electronic)", ISSN-L = "0920-8542", bibdate = "Mon Feb 28 16:44:31 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/jsuper2020.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "https://link.springer.com/article/10.1007/s11227-021-03804-6", acknowledgement = ack-nhfb, ajournal = "J. 
Supercomputing", fjournal = "The Journal of Supercomputing", journal-URL = "http://link.springer.com/journal/11227", } @InProceedings{Barbirotta:2021:FTS, author = "Marcello Barbirotta and Abdallah Cheikh and Antonio Mastrandrea and Francesco Menichelli and Francesco Vigli and Mauro Olivieri", editor = "{IEEE}", booktitle = "{2021 IEEE International Symposium on Defect and Fault Tolerance in VLSI and Nanotechnology Systems (DFT)}", title = "A Fault Tolerant soft-core obtained from an Interleaved-Multi-Threading {RISC-V} microprocessor design", publisher = pub-IEEE, address = pub-IEEE:adr, pages = "1--4", year = "2021", DOI = "https://doi.org/10.1109/DFT52944.2021.9568368", bibdate = "Sat Dec 16 15:51:40 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/risc-v.bib", acknowledgement = ack-nhfb, } @Article{Baumann:2021:CBV, author = "Pascal Baumann and Rupak Majumdar and Ramanathan S. Thinniyam and Georg Zetzsche", title = "Context-bounded verification of liveness properties for multithreaded shared-memory programs", journal = j-PACMPL, volume = "5", number = "POPL", pages = "44:1--44:31", month = jan, year = "2021", DOI = "https://doi.org/10.1145/3434325", bibdate = "Tue Mar 30 08:10:58 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pacmpl.bib", URL = "https://dl.acm.org/doi/10.1145/3434325", abstract = "We study context-bounded verification of liveness properties of multi-threaded, shared-memory programs, where each thread can spawn additional threads. Our main result shows that context-bounded fair termination is decidable for the model; context-. 
\ldots{}", acknowledgement = ack-nhfb, articleno = "44", fjournal = "Proceedings of the ACM on Programming Languages", journal-URL = "https://pacmpl.acm.org/", } @Article{Carroll:2021:ELT, author = "Shane Carroll and Wei-ming Lin", title = "Exploiting Long-Term Temporal Cache Access Patterns for {LRU} Insertion Prioritization", journal = j-PARALLEL-PROCESS-LETT, volume = "31", number = "02", pages = "??--??", month = jun, year = "2021", DOI = "https://doi.org/10.1142/S0129626421500109", ISSN = "0129-6264 (print), 1793-642X (electronic)", ISSN-L = "0129-6264", bibdate = "Thu Feb 17 06:50:36 MST 2022", bibsource = "http://ejournals.wspc.com.sg/ppl/; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/parallelprocesslett.bib", URL = "https://www.worldscientific.com/doi/10.1142/S0129626421500109", abstract = "In a CPU cache utilizing least recently used (LRU) replacement, cache sets manage a buffer which orders all cache lines in the set from LRU to most recently used (MRU). When a cache line is brought into cache, it is placed at the MRU and the LRU line is evicted. When re-accessed, a line is promoted to the MRU position. LRU replacement provides a simple heuristic to predict the optimal cache line to evict. However, LRU utilizes only simple, short-term access patterns. In this paper, we propose a method that uses a buffer called the history queue to record longer-term access-eviction patterns than the LRU buffer can capture. Using this information, we make a simple modification to LRU insertion policy such that recently-recalled blocks have priority over others. As lines are evicted, their addresses are recorded in a FIFO history queue. Incoming lines that have recently been evicted and now recalled (those in the history queue at recall time) remain in the MRU for an extended period of time as non-recalled lines entering the cache thereafter are placed below the MRU. 
We show that the proposed LRU insertion prioritization increases performance in single-threaded and multi-threaded workloads in simulations with simple adjustments to baseline LRU.", acknowledgement = ack-nhfb, articleno = "2150010", fjournal = "Parallel Processing Letters", journal-URL = "http://www.worldscientific.com/loi/ppl", } @Article{Cheikh:2021:KDV, author = "A. Cheikh and S. Sordillo and A. Mastrandrea and F. Menichelli and G. Scotti and M. Olivieri", title = "{Klessydra-T}: Designing Vector Coprocessors for Multithreaded Edge-Computing Cores", journal = j-IEEE-MICRO, volume = "41", number = "2", pages = "64--71", month = mar # "\slash " # apr, year = "2021", CODEN = "IEMIDZ", DOI = "https://doi.org/10.1109/MM.2021.3050962", ISSN = "0272-1732 (print), 1937-4143 (electronic)", ISSN-L = "0272-1732", bibdate = "Thu Apr 1 10:32:23 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/hot-chips.bib; https://www.math.utah.edu/pub/tex/bib/ieeemicro.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Micro", journal-URL = "http://www.computer.org/csdl/mags/mi/index.html", } @Article{Conoci:2021:PCP, author = "Stefano Conoci and Pierangelo {Di Sanzo} and Alessandro Pellegrini and Bruno Ciciani and Francesco Quaglia", title = "On power capping and performance optimization of multithreaded applications", journal = j-CCPE, volume = "33", number = "13", pages = "e6205:1--e6205:??", day = "10", month = jul, year = "2021", CODEN = "CCPEBO", DOI = "https://doi.org/10.1002/cpe.6205", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Tue Feb 22 09:49:54 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/ccpe2020.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, ajournal = "Concurrency Computat., Pract. 
Exper.", fjournal = "Concurrency and Computation: Practice and Experience", journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626", onlinedate = "27 January 2021", } @Article{Kozicky:2021:JDT, author = "Claudio Kozick{\'y} and Ivan Simecek", title = "Joint direct and transposed sparse matrix-vector multiplication for multithreaded {CPUs}", journal = j-CCPE, volume = "33", number = "13", pages = "e6236:1--e6236:??", day = "10", month = jul, year = "2021", CODEN = "CCPEBO", DOI = "https://doi.org/10.1002/cpe.6236", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Tue Feb 22 09:49:54 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/ccpe2020.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, ajournal = "Concurrency Computat., Pract. Exper.", fjournal = "Concurrency and Computation: Practice and Experience", journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626", onlinedate = "22 February 2021", } @Article{Li:2021:MEC, author = "Botao Li and Synge Todo and A. C. 
Maggs and Werner Krauth", title = "Multithreaded event-chain {Monte Carlo} with local times", journal = j-COMP-PHYS-COMM, volume = "261", number = "??", pages = "Article 107702", month = apr, year = "2021", CODEN = "CPHCBZ", DOI = "https://doi.org/10.1016/j.cpc.2020.107702", ISSN = "0010-4655 (print), 1879-2944 (electronic)", ISSN-L = "0010-4655", bibdate = "Sat Mar 13 08:21:42 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/compphyscomm2020.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.sciencedirect.com/science/article/pii/S0010465520303453", acknowledgement = ack-nhfb, fjournal = "Computer Physics Communications", journal-URL = "http://www.sciencedirect.com/science/journal/00104655", } @Article{Ma:2021:RTB, author = "Xiaoxue Ma and Shangru Wu and Ernest Pobee and Xiupei Mei and Hao Zhang and Bo Jiang and Wing-Kwong Chan", title = "{RegionTrack}: a Trace-Based Sound and Complete Checker to Debug Transactional Atomicity Violations and Non-Serializable Traces", journal = j-TOSEM, volume = "30", number = "1", pages = "7:1--7:49", month = jan, year = "2021", CODEN = "ATSMER", DOI = "https://doi.org/10.1145/3412377", ISSN = "1049-331X (print), 1557-7392 (electronic)", ISSN-L = "1049-331X", bibdate = "Fri Jan 22 07:02:14 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/tosem.bib", URL = "https://dl.acm.org/doi/10.1145/3412377", abstract = "Atomicity is a correctness criterion to reason about isolated code regions in a multithreaded program when they are executed concurrently. However, dynamic instances of these code regions, called transactions, may fail to behave atomically, resulting in \ldots{}", acknowledgement = ack-nhfb, articleno = "7", fjournal = "ACM Transactions on Software Engineering and Methodology", journal-URL = "https://dl.acm.org/loi/tosem", } @Article{Mattson:2021:PPM, author = "Timothy G. Mattson and Todd A. 
Anderson and Giorgis Georgakoudis", title = "\pkg{PyOMP}: Multithreaded Parallel Programming in {Python}", journal = j-COMPUT-SCI-ENG, volume = "23", number = "6", pages = "77--80", month = nov # "\slash " # dec, year = "2021", CODEN = "CSENFA", DOI = "https://doi.org/10.1109/MCSE.2021.3128806", ISSN = "1521-9615 (print), 1558-366X (electronic)", ISSN-L = "1521-9615", bibdate = "Mon Jan 31 16:30:09 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/computscieng.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/python.bib", acknowledgement = ack-nhfb, fjournal = "Computing in Science and Engineering", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=5992", } @Article{Metzger:2021:DHT, author = "Paul Metzger and Volker Seeker and Christian Fensch and Murray Cole", title = "Device Hopping: Transparent Mid-Kernel Runtime Switching for Heterogeneous Systems", journal = j-TACO, volume = "18", number = "4", pages = "57:1--57:25", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3471909", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 4 07:14:07 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3471909", abstract = "Existing OS techniques for homogeneous many-core systems make it simple for single and multithreaded applications to migrate between cores. Heterogeneous systems do not benefit so fully from this flexibility, and applications that cannot migrate in mid-. 
\ldots{}", acknowledgement = ack-nhfb, articleno = "57", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Nagler:2021:CSR, author = "Thomas Nagler", title = "Code Snippet: {R}-Friendly Multi-Threading in {C++}", journal = j-J-STAT-SOFT, volume = "97", number = "??", pages = "??--??", month = "????", year = "2021", CODEN = "JSSOBK", DOI = "https://doi.org/10.18637/jss.v97.c01", ISSN = "1548-7660", ISSN-L = "1548-7660", bibdate = "Wed May 19 07:43:42 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/jstatsoft.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "https://www.jstatsoft.org/index.php/jss/article/view/v097c01; https://www.jstatsoft.org/index.php/jss/article/view/v097c01/v97c01.pdf", acknowledgement = ack-nhfb, journal-URL = "http://www.jstatsoft.org/", } @Article{Park:2021:GTA, author = "Jiwon Park and Dominik Winterer and Chengyu Zhang and Zhendong Su", title = "Generative type-aware mutation for testing {SMT} solvers", journal = j-PACMPL, volume = "5", number = "OOPSLA", pages = "152:1--152:19", month = oct, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3485529", ISSN = "2475-1421 (electronic)", ISSN-L = "2475-1421", bibdate = "Wed Mar 2 07:00:43 MST 2022", bibsource = "http://www.math.utah.edu/pub/tex/bib/pacmpl.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "https://dl.acm.org/doi/10.1145/3485529", abstract = "We propose Generative Type-Aware Mutation, an effective approach for testing SMT solvers. The key idea is to realize generation through the mutation of expressions rooted with parametric operators from the SMT-LIB specification. 
Generative Type-Aware \ldots{}", acknowledgement = ack-nhfb, articleno = "152", fjournal = "Proceedings of the ACM on Programming Languages (PACMPL)", journal-URL = "https://dl.acm.org/loi/pacmpl", } @Article{Romanous:2021:ELL, author = "Bashar Romanous and Skyler Windh and Vassilis Tsotras", title = "Efficient local locking for massively multithreaded in-memory hash-based operators", journal = j-VLDB-J, volume = "30", number = "3", pages = "333--359", month = may, year = "2021", CODEN = "VLDBFR", DOI = "https://doi.org/10.1007/s00778-020-00642-5", ISSN = "1066-8888 (print), 0949-877X (electronic)", ISSN-L = "1066-8888", bibdate = "Sat Apr 9 10:33:58 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/vldbj.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "https://link.springer.com/article/10.1007/s00778-020-00642-5", acknowledgement = ack-nhfb, ajournal = "VLDB J.", fjournal = "VLDB Journal: Very Large Data Bases", journal-URL = "http://portal.acm.org/toc.cfm?id=J869", } @Article{Sonenberg:2021:PAW, author = "Nikki Sonenberg and Grzegorz Kielanski and Benny {Van Houdt}", title = "Performance Analysis of Work Stealing in Large-scale Multithreaded Computing", journal = j-TOMPECS, volume = "6", number = "2", pages = "6:1--6:28", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3470887", ISSN = "2376-3639 (print), 2376-3647 (electronic)", ISSN-L = "2376-3639", bibdate = "Wed Mar 2 06:32:09 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/tompecs.bib", URL = "https://dl.acm.org/doi/10.1145/3470887", abstract = "Randomized work stealing is used in distributed systems to increase performance and improve resource utilization. 
In this article, we consider randomized work stealing in a large system of homogeneous processors where parent jobs spawn child jobs that can \ldots{}", acknowledgement = ack-nhfb, articleno = "6", fjournal = "ACM Transactions on Modeling and Performance Evaluation of Computing Systems (TOMPECS)", journal-URL = "https://dl.acm.org/loi/tompecs", } @Article{Steele:2021:PLB, author = "Guy L. {Steele Jr.} and Sebastiano Vigna", title = "\pkg{LXM}: better splittable pseudorandom number generators (and almost as fast)", journal = j-PACMPL, volume = "5", number = "OOPSLA", pages = "148:1--148:31", month = oct, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3485525", ISSN = "2475-1421 (electronic)", ISSN-L = "2475-1421", bibdate = "Wed Mar 2 07:00:43 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/pacmpl.bib; https://www.math.utah.edu/pub/tex/bib/prng.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "https://dl.acm.org/doi/10.1145/3485525", abstract = "In 2014, Steele, Lea, and Flood presented SplitMix, an object-oriented pseudorandom number generator (prng) that is quite fast (9 64-bit arithmetic/logical operations per 64 bits generated) and also splittable. A conventional prng object provides a generate method that returns one pseudorandom value and updates the state of the prng; a splittable prng object also has a second operation, split, that replaces the original prng object with two (seemingly) independent prng objects, by creating and returning a new such object and updating the state of the original object. Splittable prng objects make it easy to organize the use of pseudorandom numbers in multithreaded programs structured using fork-join parallelism. 
This overall strategy still appears to be sound, but the specific arithmetic calculation used for generate in the SplitMix algorithm has some detectable weaknesses, and the period of any one generator is limited to $2^{64}$.\par Here we present the LXM family of prng algorithms. The idea is an old one: combine the outputs of two independent prng algorithms, then (optionally) feed the result to a mixing function. An LXM algorithm uses a linear congruential subgenerator and an {$F_2$}-linear subgenerator; the examples studied in this paper use a linear congruential generator (LCG) of period $2^{16}$, $2^{32}$, $2^{64}$, or $2^{128}$ with one of the multipliers recommended by L'Ecuyer or by Steele and Vigna, and an {$F_2$}-linear xor-based generator (XBG) of the xoshiro family or xoroshiro family as described by Blackman and Vigna. For mixing functions we study the MurmurHash3 finalizer function; variants by David Stafford, Doug Lea, and degski; and the null (identity) mixing function.\par Like SplitMix, LXM provides both a generate operation and a split operation. Also like SplitMix, LXM requires no locking or other synchronization (other than the usual memory fence after instance initialization), and is suitable for use with simd instruction sets because it has no branches or loops.\par We analyze the period and equidistribution properties of LXM generators, and present the results of thorough testing of specific members of this family, using the TestU01 and PractRand test suites, not only on single instances of the algorithm but also for collections of instances, used in parallel, ranging in size from 2 to $2^{24}$. Single instances of LXM that include a strong mixing function appear to have no major weaknesses, and LXM is significantly more robust than SplitMix against accidental correlation in a multithreaded setting. 
We believe that LXM, like SplitMix, is suitable for ``everyday'' scientific and machine-learning applications (but not cryptographic applications), especially when concurrent threads or distributed processes are involved.", acknowledgement = ack-nhfb, articleno = "148", fjournal = "Proceedings of the ACM on Programming Languages (PACMPL)", journal-URL = "https://dl.acm.org/loi/pacmpl", } @Article{Tang:2021:MMR, author = "Xulong Tang and Mahmut Taylan Kandemir and Mustafa Karakoy", title = "Mix and Match: Reorganizing Tasks for Enhancing Data Locality", journal = j-POMACS, volume = "5", number = "2", pages = "20:1--20:24", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3460087", ISSN = "2476-1249", ISSN-L = "2476-1249", bibdate = "Wed Mar 2 06:36:38 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/pomacs.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "https://dl.acm.org/doi/10.1145/3460087", abstract = "Application programs that exhibit strong locality of reference lead to minimized cache misses and better performance in different architectures. 
However, to maximize the performance of multithreaded applications running on emerging manycore systems, \ldots{}", acknowledgement = ack-nhfb, articleno = "20", fjournal = "Proceedings of the ACM on Measurement and Analysis of Computing Systems (POMACS)", journal-URL = "https://dl.acm.org/loi/pomacs", } @Article{Tao:2021:CDS, author = "Xiaohan Tao and Jianmin Pang and Yu Zhu", title = "Compiler-directed scratchpad memory data transfer optimization for multithreaded applications on a heterogeneous many-core architecture", journal = j-J-SUPERCOMPUTING, volume = "77", number = "12", pages = "14502--14524", month = dec, year = "2021", CODEN = "JOSUED", DOI = "https://doi.org/10.1007/s11227-021-03853-x", ISSN = "0920-8542 (print), 1573-0484 (electronic)", ISSN-L = "0920-8542", bibdate = "Mon Feb 28 16:44:31 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/jsuper2020.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "https://link.springer.com/article/10.1007/s11227-021-03853-x", acknowledgement = ack-nhfb, ajournal = "J. Supercomputing", fjournal = "The Journal of Supercomputing", journal-URL = "http://link.springer.com/journal/11227", } @Article{Anju:2022:MID, author = "M. A. Anju and Rupesh Nasre", title = "Multi-Interval {DomLock}: Toward Improving Concurrency in Hierarchies", journal = j-TOPC, volume = "9", number = "3", pages = "12:1--12:27", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3543543", ISSN = "2329-4949 (print), 2329-4957 (electronic)", ISSN-L = "2329-4949", bibdate = "Tue Sep 20 09:34:53 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/topc.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "https://dl.acm.org/doi/10.1145/3543543", abstract = "Locking has been a predominant technique depended upon for achieving thread synchronization and ensuring correctness in multi-threaded applications. 
It has been established that the concurrent applications working with hierarchical data witness \ldots{}", acknowledgement = ack-nhfb, articleno = "12", fjournal = "ACM Transactions on Parallel Computing", journal-URL = "https://dl.acm.org/loi/topc", } @Article{Bozkurt:2022:UCC, author = "Erkam Murat Bozkurt", title = "The usage of cybernetic in complex software systems and its application to the deterministic multithreading", journal = j-CCPE, volume = "34", number = "28", pages = "e7375:1--e7375:??", day = "25", month = dec, year = "2022", CODEN = "CCPEBO", DOI = "https://doi.org/10.1002/cpe.7375", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Wed Mar 15 08:11:28 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/ccpe2020.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, ajournal = "Concurr. Comput.", fjournal = "Concurrency and Computation: Practice and Experience", journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626", onlinedate = "31 October 2022", } @Article{Cheng:2022:EMA, author = "Jianyi Cheng and Shane T. Fleming and Yu Ting Chen and Jason Anderson and John Wickerson and George A. 
Constantinides", title = "Efficient Memory Arbitration in High-Level Synthesis From Multi-Threaded Code", journal = j-IEEE-TRANS-COMPUT, volume = "71", number = "4", pages = "933--946", month = apr, year = "2022", CODEN = "ITCOB4", DOI = "https://doi.org/10.1109/TC.2021.3066466", ISSN = "0018-9340 (print), 1557-9956 (electronic)", ISSN-L = "0018-9340", bibdate = "Thu Mar 17 06:38:17 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2020.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Computers", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12", } @Article{Eni:2022:EHB, author = "Yossi Eni and Shlomo Greenberg and Yehuda Ben-Shimol", title = "Efficient Hint-Based Event {(EHE)} Issue Scheduling for Hardware Multithreaded {RISC-V} Pipeline", journal = j-IEEE-TRANS-CIRCUITS-SYST-1, volume = "69", number = "2", pages = "735--745", year = "2022", DOI = "https://doi.org/10.1109/TCSI.2021.3117490", ISSN = "1549-8328 (print), 1558-0806 (electronic)", ISSN-L = "1549-8328", bibdate = "Sat Dec 16 15:51:40 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/risc-v.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Circuits and Systems I: Regular Papers", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=8919", } @Article{Feliu:2022:VVM, author = "Josu{\'e} Feliu and Ajeya Naithani and Julio Sahuquillo and Salvador Petit and Moinuddin Qureshi and Lieven Eeckhout", title = "{VMT}: Virtualized Multi-Threading for Accelerating Graph Workloads on Commodity Processors", journal = j-IEEE-TRANS-COMPUT, volume = "71", number = "6", pages = "1386--1398", month = jun, year = "2022", CODEN = "ITCOB4", DOI = "https://doi.org/10.1109/TC.2021.3086069", ISSN = "0018-9340 (print), 1557-9956 (electronic)", ISSN-L = "0018-9340", bibdate = "Wed May 25 09:41:19 2022", 
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2020.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Computers", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12", } @Article{Inverso:2022:BVM, author = "Omar Inverso and Ermenegildo Tomasco and Bernd Fischer and Salvatore {La Torre} and Gennaro Parlato", title = "Bounded Verification of Multi-threaded Programs via Lazy Sequentialization", journal = j-TOPLAS, volume = "44", number = "1", pages = "1:1--1:50", month = mar, year = "2022", CODEN = "ATPSDT", DOI = "https://doi.org/10.1145/3478536", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Fri Jan 14 06:53:13 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/toplas.bib", URL = "https://dl.acm.org/doi/10.1145/3478536", abstract = "Bounded verification techniques such as bounded model checking (BMC) have successfully been used for many practical program analysis problems, but concurrency still poses a challenge. 
Here, we describe a new approach to BMC of sequentially consistent \ldots{}", acknowledgement = ack-nhfb, articleno = "1", fjournal = "ACM Transactions on Programming Languages and Systems", journal-URL = "https://dl.acm.org/loi/toplas", } @Article{Kelefouras:2022:WSM, author = "Vasilios Kelefouras and Karim Djemame", title = "Workflow simulation and multi-threading aware task scheduling for heterogeneous computing", journal = j-J-PAR-DIST-COMP, volume = "168", number = "??", pages = "17--32", month = oct, year = "2022", CODEN = "JPDCER", DOI = "https://doi.org/10.1016/j.jpdc.2022.05.011", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Sat Jul 16 10:35:47 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/jpardistcomp2020.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.sciencedirect.com/science/article/pii/S0743731522001265", acknowledgement = ack-nhfb, fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", } @Article{Korndorfer:2022:LDL, author = "Jonas H. M{\"u}ller Kornd{\"o}rfer and Ahmed Eleliemy and Ali Mohammed and Florina M. 
Ciorba", title = "{LB4OMP}: a Dynamic Load Balancing Library for Multithreaded Applications", journal = j-IEEE-TRANS-PAR-DIST-SYS, volume = "33", number = "4", pages = "830--841", month = apr, year = "2022", CODEN = "ITDSEO", DOI = "https://doi.org/10.1109/TPDS.2021.3107775", ISSN = "1045-9219 (print), 1558-2183 (electronic)", ISSN-L = "1045-9219", bibdate = "Thu Nov 11 08:39:34 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Parallel and Distributed Systems", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=71", } @Article{Minutoli:2022:PSH, author = "Marco Minutoli and Vito Giovanni Castellana and Nicola Saporetti and Stefano Devecchi and Marco Lattuada and Pietro Fezzardi and Antonino Tumeo and Fabrizio Ferrandi", title = "\pkg{Svelto}: High-Level Synthesis of Multi-Threaded Accelerators for Graph Analytics", journal = j-IEEE-TRANS-COMPUT, volume = "71", number = "3", pages = "520--533", month = mar, year = "2022", CODEN = "ITCOB4", DOI = "https://doi.org/10.1109/TC.2021.3057860", ISSN = "0018-9340 (print), 1557-9956 (electronic)", ISSN-L = "0018-9340", bibdate = "Thu Feb 17 08:09:56 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2020.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Computers", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12", } @Article{Myllykoski:2022:ATB, author = "Mirko Myllykoski", title = "{Algorithm 1019}: a Task-based Multi-shift {$ Q R $ \slash $ Q Z $} Algorithm with Aggressive Early Deflation", journal = j-TOMS, volume = "48", number = "1", pages = "11:1--11:36", month = mar, year = "2022", CODEN = "ACMSCU", DOI = "https://doi.org/10.1145/3495005", ISSN = "0098-3500 (print), 1557-7295 (electronic)", ISSN-L = "0098-3500", bibdate = 
"Thu Feb 17 08:00:57 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/toms.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "https://dl.acm.org/doi/10.1145/3495005", abstract = "The $ Q R $ algorithm is one of the three phases in the process of computing the eigenvalues and the eigenvectors of a dense nonsymmetric matrix. This paper describes a task-based $ Q R $ algorithm for reducing an upper Hessenberg matrix to real Schur form. The task-based algorithm also supports generalized eigenvalue problems ($ Q Z $ algorithm) but this paper concentrates on the standard case. The task-based algorithm adopts previous algorithmic improvements, such as tightly-coupled multi-shifts and Aggressive Early Deflation (AED), and also incorporates several new ideas that significantly improve the performance. This includes, but is not limited to, the elimination of several synchronization points, the dynamic merging of previously separate computational steps, the shortening and the prioritization of the critical path, and experimental GPU support. The task-based implementation is demonstrated to be multiple times faster than multi-threaded LAPACK and ScaLAPACK in both single-node and multi-node configurations on two different machines based on Intel and AMD CPUs. 
The implementation is built on top of the StarPU runtime system and is part of the open-source StarNEig library.", acknowledgement = ack-nhfb, articleno = "11", fjournal = "ACM Transactions on Mathematical Software (TOMS)", journal-URL = "https://dl.acm.org/loi/toms", } @Article{Perrin:2022:EWF, author = "Matthieu Perrin and Achour Most{\'e}faoui and Ludmila Courtillat-Piazza", title = "Extending the wait-free hierarchy to multi-threaded systems", journal = j-DISTRIB-COMPUT, volume = "35", number = "4", pages = "375--398", month = aug, year = "2022", CODEN = "DICOEB", DOI = "https://doi.org/10.1007/s00446-022-00425-x", ISSN = "0178-2770 (print), 1432-0452 (electronic)", ISSN-L = "0178-2770", bibdate = "Mon Aug 1 08:49:35 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/distribcomput.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "https://link.springer.com/article/10.1007/s00446-022-00425-x", acknowledgement = ack-nhfb, ajournal = "Distrib. comput.", fjournal = "Distributed Computing", journal-URL = "https://link.springer.com/journal/446", } @Article{Pons:2022:EHT, author = "Luc{\'{\i}}a Pons and Josu{\'e} Feliu and Jos{\'e} Puche and Chaoyi Huang and Salvador Petit and Julio Pons and Mar{\'{\i}}a E. 
G{\'o}mez and Julio Sahuquillo", title = "Effect of Hyper-Threading in Latency-Critical Multithreaded Cloud Applications and Utilization Analysis of the Major System Resources", journal = j-FUT-GEN-COMP-SYS, volume = "131", number = "??", pages = "194--208", month = jun, year = "2022", CODEN = "FGSEVI", DOI = "https://doi.org/10.1016/j.future.2022.01.025", ISSN = "0167-739X (print), 1872-7115 (electronic)", ISSN-L = "0167-739X", bibdate = "Wed Mar 9 17:27:32 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/futgencompsys2020.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.sciencedirect.com/science/article/pii/S0167739X22000334", acknowledgement = ack-nhfb, fjournal = "Future Generation Computer Systems", journal-URL = "http://www.sciencedirect.com/science/journal/0167739X", } @Article{Raad:2022:EIX, author = "Azalea Raad and Luc Maranget and Viktor Vafeiadis", title = "Extending {Intel-x86} consistency and persistency: formalising the semantics of {Intel-x86} memory types and non-temporal stores", journal = j-PACMPL, volume = "6", number = "POPL", pages = "22:1--22:31", month = jan, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3498683", ISSN = "2475-1421 (electronic)", ISSN-L = "2475-1421", bibdate = "Thu May 26 06:32:48 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/pacmpl.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "https://dl.acm.org/doi/10.1145/3498683", abstract = "Existing semantic formalisations of the Intel-x86 architecture cover only a small fragment of its available features that are relevant for the consistency semantics of multi-threaded programs as well as the persistency semantics of programs interfacing \ldots{}", acknowledgement = ack-nhfb, articleno = "22", fjournal = "Proceedings of the ACM on Programming Languages (PACMPL)", journal-URL = "https://dl.acm.org/loi/pacmpl", } @Article{Ritchie:2022:DPF, author = "Robert Ritchie and Khodakhast 
Bibak", title = "\pkg{DOTMIX-Pro}: faster and more efficient variants of {DOTMIX} for dynamic-multithreading platforms", journal = j-J-SUPERCOMPUTING, volume = "78", number = "1", pages = "945--961", month = jan, year = "2022", CODEN = "JOSUED", DOI = "https://doi.org/10.1007/s11227-021-03904-3", ISSN = "0920-8542 (print), 1573-0484 (electronic)", ISSN-L = "0920-8542", bibdate = "Mon Feb 28 16:44:33 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/jsuper2020.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "https://link.springer.com/article/10.1007/s11227-021-03904-3", acknowledgement = ack-nhfb, ajournal = "J. Supercomputing", fjournal = "The Journal of Supercomputing", journal-URL = "http://link.springer.com/journal/11227", } @Article{Rodriguez:2022:EHB, author = "Alfonso Rodr{\'\i}guez and Andr{\'e}s Otero and Marco Platzner and Eduardo de la Torre", title = "Exploiting Hardware-Based Data-Parallel and Multithreading Models for Smart Edge Computing in Reconfigurable {FPGAs}", journal = j-IEEE-TRANS-COMPUT, volume = "71", number = "11", pages = "2903--2914", month = nov, year = "2022", CODEN = "ITCOB4", DOI = "https://doi.org/10.1109/TC.2021.3107196", ISSN = "0018-9340 (print), 1557-9956 (electronic)", ISSN-L = "0018-9340", bibdate = "Thu Oct 27 15:52:25 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranscomput2020.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Computers", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=12", } @Article{Schwab:2022:SSV, author = "Michail Schwab and David Saffo and Nicholas Bond and Shash Sinha and Cody Dunne and Jeff Huang and James Tompkin and Michelle A. 
Borkin", title = "Scalable Scalable Vector Graphics: Automatic Translation of Interactive {SVGs} to a Multithread {VDOM} for Fast Rendering", journal = j-IEEE-TRANS-VIS-COMPUT-GRAPH, volume = "28", number = "9", pages = "3219--3234", month = sep, year = "2022", CODEN = "ITVGEA", DOI = "https://doi.org/10.1109/TVCG.2021.3059294", ISSN = "1077-2626", ISSN-L = "1077-2626", bibdate = "Thu Aug 4 06:28:31 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetransviscomputgraph2020.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Visualization and Computer Graphics", journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=2945", } @Article{So:2022:EES, author = "Hwisoo So and Moslem Didehban and Yohan Ko and Aviral Shrivastava and Kyoungwoo Lee", title = "{EXPERTISE}: an Effective Software-level Redundant Multithreading Scheme against Hardware Faults", journal = j-TACO, volume = "19", number = "4", pages = "53:1--53:??", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3546073", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Dec 8 06:39:05 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3546073", abstract = "Error resilience is the primary design concern for safety- and mission-critical applications. Redundant MultiThreading (RMT) is one of the most promising soft and hard \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "53", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Trotter:2022:MTO, author = "James D. Trotter and Xing Cai and Simon W. 
Funke", title = "On Memory Traffic and Optimisations for Low-order Finite Element Assembly Algorithms on Multi-core {CPUs}", journal = j-TOMS, volume = "48", number = "2", pages = "19:1--19:31", month = jun, year = "2022", CODEN = "ACMSCU", DOI = "https://doi.org/10.1145/3503925", ISSN = "0098-3500 (print), 1557-7295 (electronic)", ISSN-L = "0098-3500", bibdate = "Wed Jul 20 07:04:17 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/toms.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "https://dl.acm.org/doi/10.1145/3503925", abstract = "Motivated by the wish to understand the achievable performance of finite element assembly on unstructured computational meshes, we dissect the standard cellwise assembly algorithm into four kernels, two of which are dominated by irregular memory traffic. Several optimisation schemes are studied together with associated lower and upper bounds on the estimated memory traffic volume. Apart from properly reordering the mesh entities, the two most significant optimisations include adopting a lookup table in adding element matrices or vectors to their global counterparts, and using a row-wise assembly algorithm for multi-threaded parallelisation. Rigorous benchmarking shows that, due to the various optimisations, the actual volumes of memory traffic are in many cases very close to the estimated lower bounds. 
These results confirm the effectiveness of the optimisations, while also providing a recipe for developing efficient software for finite element assembly.", acknowledgement = ack-nhfb, articleno = "19", fjournal = "ACM Transactions on Mathematical Software (TOMS)", journal-URL = "https://dl.acm.org/loi/toms", } @Article{Wang:2022:ASM, author = "Zhe Wang and Chen Xu and Kunal Agrawal and Jing Li", title = "Adaptive scheduling of multiprogrammed dynamic-multithreading applications", journal = j-J-PAR-DIST-COMP, volume = "162", number = "??", pages = "76--88", month = apr, year = "2022", CODEN = "JPDCER", DOI = "https://doi.org/10.1016/j.jpdc.2022.01.009", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Thu Feb 10 06:39:27 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/jpardistcomp.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.sciencedirect.com/science/article/pii/S0743731522000144", acknowledgement = ack-nhfb, fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", } @Misc{Yee:2022:CMT, author = "Alexander J. Yee", title = "{{\tt y-cruncher}} --- a multi-threaded pi-program", howpublished = "Web site", day = "13", month = oct, year = "2022", bibdate = "Mon Dec 05 08:24:08 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pi.bib", URL = "http://www.numberworld.org/y-cruncher/", abstract = "y-cruncher is a program that can compute Pi and other constants to trillions of digits. It is the first of its kind that is multi-threaded and scalable to multi-core systems. 
Ever since its launch in 2009, it has become a common benchmarking and stress-testing application for overclockers and hardware enthusiasts.", acknowledgement = ack-nhfb, remark = "From the Web site:\\ 105 trillion digits - February 2024 (Jordan Ranous, Kevin O'Brien, and Brian Beeler) \\ 100 trillion digits - June 2022 (Emma Haruka Iwao) \\ 62.8 trillion digits - August 2021 (UAS Grisons) \\ 50 trillion digits - January 2020 (Timothy Mullican) \\ 31.4 trillion digits - January 2019 (Emma Haruka Iwao) \\ 22.4 trillion digits - November 2016 (Peter Trueb) \\ 13.3 trillion digits - October 2014 (Sandon Van Ness ``houkouonchi'') \\ 12.1 trillion digits - December 2013 (Shigeru Kondo) \\ 10 trillion digits - October 2011 (Shigeru Kondo) \\ 5 trillion digits - August 2010 (Shigeru Kondo)", } @Article{Zou:2022:BSP, author = "Changwei Zou and Xudong Wang and Yaoqing Gao and Jingling Xue", title = "Buddy Stacks: Protecting Return Addresses with Efficient Thread-Local Storage and Runtime Re-Randomization", journal = j-TOSEM, volume = "31", number = "2", pages = "35e:1--35e:37", month = apr, year = "2022", CODEN = "ATSMER", DOI = "https://doi.org/10.1145/3494516", ISSN = "1049-331X (print), 1557-7392 (electronic)", ISSN-L = "1049-331X", bibdate = "Tue May 24 07:09:20 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/tosem.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "https://dl.acm.org/doi/10.1145/3494516", abstract = "Shadow stacks play an important role in protecting return addresses to mitigate ROP attacks. Parallel shadow stacks, which shadow the call stack of each thread at the same constant offset for all threads, are known not to support multi-threading well. 
On \ldots{}", acknowledgement = ack-nhfb, articleno = "35e", fjournal = "ACM Transactions on Software Engineering and Methodology", journal-URL = "https://dl.acm.org/loi/tosem", } @Article{Zou:2022:PSB, author = "Changwei Zou and Yaoqing Gao and Jingling Xue", title = "Practical Software-Based Shadow Stacks on x86-64", journal = j-TACO, volume = "19", number = "4", pages = "61:1--61:??", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3556977", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Dec 8 06:39:05 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3556977", abstract = "Control-Flow Integrity (CFI) techniques focus often on protecting forward edges and assume that backward edges are protected by shadow stacks. However, software-based shadow stacks that can provide performance, security, and compatibility are still hard to obtain, leaving an important security gap on x86-64. In this article, we introduce a simple, efficient, and effective parallel shadow stack design (based on LLVM), FlashStack, for protecting return addresses in single- and multi-threaded programs running under 64-bit Linux on x86-64, with three distinctive features. First, we introduce a novel dual-prologue approach to enable a protected function to thwart the TOCTTOU attacks, which are constructed by Microsoft's red team and lead to the deprecation of Microsoft's RFG. Second, we design a new mapping mechanism, Segment+Rsp-S, to allow the parallel shadow stack to be accessed efficiently while satisfying the constraints of arch\_prctl() and ASLR in 64-bit Linux. Finally, we introduce a lightweight inspection mechanism, SideChannel-K, to harden FlashStack further by detecting entropy-reduction attacks efficiently and protecting the parallel shadow stack effectively with a 10-ms shuffling policy. 
Our evaluation on SPEC CPU2006, Nginx, and Firefox shows that FlashStack can provide high performance, meaningful security, and reasonable compatibility for server- and client-side programs on x86-64.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "61", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Alsaker:2023:MRT, author = "Melody Alsaker and Jennifer L. Mueller and Andreas Stahel", title = "A multithreaded real-time solution for {$2$D} {EIT} reconstruction with the {D-bar} algorithm", journal = j-J-COMPUT-SCI, volume = "67", pages = "??--??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1016/j.jocs.2023.101967", ISSN = "1877-7503 (print), 1877-7511 (electronic)", ISSN-L = "1877-7503", bibdate = "Tue Sep 19 13:56:38 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jcomputsci.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "https://www.sciencedirect.com/science/article/pii/S1877750323000273", acknowledgement = ack-nhfb, ajournal = "J. Comput. Sci.", articleno = "101967", fjournal = "Journal of Computational Science", journal-URL = "https://www.sciencedirect.com/journal/journal-of-computational-science", } @Article{Arslan:2023:ETC, author = "Sanem Arslan and Osman {\"U}nsal", title = "Efficient thread-to-core mapping alternatives for application-level redundant multithreading", journal = j-CCPE, volume = "35", number = "24", pages = "e7622:1--e7622:??", day = "1", month = nov, year = "2023", CODEN = "CCPEBO", DOI = "https://doi.org/10.1002/cpe.7622", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Tue May 28 09:32:25 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/ccpe2020.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, ajournal = "Concurr. 
Comput.", fjournal = "Concurrency and Computation: Practice and Experience", journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626", onlinedate = "18 January 2023", } @Article{Czarnul:2023:MCO, author = "Pawe{\l} Czarnul", title = "A multithreaded {CUDA} and {OpenMP} based power-aware programming framework for multi-node {GPU} systems", journal = j-CCPE, volume = "35", number = "25", pages = "e7897:1--e7897:??", day = "15", month = nov, year = "2023", CODEN = "CCPEBO", DOI = "https://doi.org/10.1002/cpe.7897", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Tue May 28 09:32:26 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/ccpe2020.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pvm.bib", acknowledgement = ack-nhfb, ajournal = "Concurr. Comput.", fjournal = "Concurrency and Computation: Practice and Experience", journal-URL = "http://www.interscience.wiley.com/jpages/1532-0626", onlinedate = "29 August 2023", } @Article{Do:2023:ITS, author = "An Do and Wei-Ming Lin", title = "Intelligent Thread-Specific Rename Register Allocation for Simultaneous Multi-Threading Processors Based on Cache Behavior", journal = j-INT-J-COMP-APPL, volume = "185", number = "??", pages = "1--9", month = aug, year = "2023", CODEN = "????", DOI = "https://doi.org/10.5120/ijca2023923037", ISSN = "0975-8887", ISSN-L = "0975-8887", bibdate = "Fri Jan 24 09:49:13 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/intjcompappl.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "https://www.ijcaonline.org/archives/volume185/number29/32873-2023923037/", acknowledgement = ack-nhfb, ajournal = "Intern. J. 
of Computer Applications", articleno = "29", fjournal = "International Journal of Computer Applications", journal-URL = "https://www.ijcaonline.org/", } @Article{Fan:2023:SMO, author = "Hongyu Fan and Zhihang Sun and Fei He", title = "Satisfiability Modulo Ordering Consistency Theory for {SC}, {TSO}, and {PSO} Memory Models", journal = j-TOPLAS, volume = "45", number = "1", pages = "6:1--6:??", month = mar, year = "2023", CODEN = "ATPSDT", DOI = "https://doi.org/10.1145/3579835", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Thu Mar 9 07:36:32 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/toplas.bib", URL = "https://dl.acm.org/doi/10.1145/3579835", abstract = "Automatically verifying multi-threaded programs is difficult because of the vast number of thread interleavings, a problem aggravated by weak memory consistency. Partial orders can help with verification because they can represent many thread \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Program. Lang. Syst.", articleno = "6", fjournal = "ACM Transactions on Programming Languages and Systems", journal-URL = "https://dl.acm.org/loi/toplas", } @Article{Feliu:2023:SIT, author = "Josu{\'e} Feliu and Alberto Ros and Manuel E. 
Acacio and Stefanos Kaxiras", title = "Speculative inter-thread store-to-load forwarding in {SMT} architectures", journal = j-J-PAR-DIST-COMP, volume = "173", number = "??", pages = "94--106", month = mar, year = "2023", CODEN = "JPDCER", DOI = "https://doi.org/10.1016/j.jpdc.2022.11.007", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Wed Dec 21 08:24:26 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/jpardistcomp2020.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.sciencedirect.com/science/article/pii/S0743731522002349", acknowledgement = ack-nhfb, fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", } @Article{Kielanski:2023:PAW, author = "Grzegorz Kielanski and Benny {Van Houdt}", title = "Performance Analysis of Work Stealing Strategies in Large-Scale Multithreaded Computing", journal = j-TOMACS, volume = "33", number = "4", pages = "15:1--15:??", month = oct, year = "2023", CODEN = "ATMCEZ", DOI = "https://doi.org/10.1145/3584186", ISSN = "1049-3301 (print), 1558-1195 (electronic)", ISSN-L = "1049-3301", bibdate = "Fri Dec 8 08:39:31 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/tomacs.bib", URL = "https://dl.acm.org/doi/10.1145/3584186", abstract = "Distributed systems use randomized work stealing to improve performance and resource utilization. In most prior analytical studies of randomized work stealing, jobs are considered to be sequential and are executed as a whole on a single server. In this \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Model. Comput. Simul.", articleno = "15", fjournal = "ACM Transactions on Modeling and Computer Simulation", journal-URL = "https://dl.acm.org/loi/tomacs", } @Article{Lu:2023:EVM, author = "Jiayin Lu and Emanuel A. Lazar and Chris H. 
Rycroft", title = "An extension to {Voro++} for multithreaded computation of {Voronoi} cells", journal = j-COMP-PHYS-COMM, volume = "291", number = "??", pages = "??--??", month = oct, year = "2023", CODEN = "CPHCBZ", DOI = "https://doi.org/10.1016/j.cpc.2023.108832", ISSN = "0010-4655 (print), 1879-2944 (electronic)", ISSN-L = "0010-4655", bibdate = "Thu Aug 10 07:51:47 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/compphyscomm2020.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.sciencedirect.com/science/article/pii/S0010465523001777", acknowledgement = ack-nhfb, articleno = "108832", fjournal = "Computer Physics Communications", journal-URL = "http://www.sciencedirect.com/science/journal/00104655", } @Article{Manis:2023:MAC, author = "George Manis and Dimitrios Bakalis and Roberto Sassi", title = "A Multithreaded Algorithm for the Computation of Sample Entropy", journal = j-ALGORITHMS-BASEL, volume = "16", number = "6", month = jun, year = "2023", CODEN = "ALGOCH", DOI = "https://doi.org/10.3390/a16060299", ISSN = "1999-4893 (electronic)", ISSN-L = "1999-4893", bibdate = "Thu Jun 29 06:31:13 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/algorithms.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "https://www.mdpi.com/1999-4893/16/6/299", acknowledgement = ack-nhfb, articleno = "299", fjournal = "Algorithms (Basel)", journal-URL = "https://www.mdpi.com/journal/algorithms", pagecount = "??", } @Article{Parizek:2023:CJP, author = "Pavel Par{\'\i}zek and Filip Kliber", title = "Checking Just Pairs of Threads for Efficient and Scalable Incremental Verification of Multithreaded Programs", journal = j-SIGSOFT, volume = "48", number = "1", pages = "27--31", month = jan, year = "2023", CODEN = "SFENDP", DOI = "https://doi.org/10.1145/3573074.3573082", ISSN = "0163-5948 (print), 1943-5843 (electronic)", ISSN-L = "0163-5948", bibdate = "Mon Mar 25 12:02:24 MDT 2024", bibsource = 
"http://www.math.utah.edu/pub/tex/bib/sigsoft2020.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "https://dl.acm.org/doi/10.1145/3573074.3573082", abstract = "Many techniques of automated verification target multithreaded programs, because subtle interactions between threads may trigger concurrency errors such as deadlocks and data races. However, techniques and tools involving systematic exploration of the \ldots{}", acknowledgement = ack-nhfb, fjournal = "ACM SIGSOFT Software Engineering Notes", journal-URL = "https://dl.acm.org/loi/sigsoft", } @Article{Que:2023:RRM, author = "Zhiqiang Que and Hiroki Nakahara and Hongxiang Fan and He Li and Jiuxi Meng and Kuen Hung Tsoi and Xinyu Niu and Eriko Nurvitadhi and Wayne Luk", title = "{Remarn}: a Reconfigurable Multi-threaded Multi-core Accelerator for Recurrent Neural Networks", journal = j-TRETS, volume = "16", number = "1", pages = "4:1--4:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3534969", ISSN = "1936-7406 (print), 1936-7414 (electronic)", ISSN-L = "1936-7406", bibdate = "Sat Mar 11 08:27:18 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/trets.bib", URL = "https://dl.acm.org/doi/10.1145/3534969", abstract = "This work introduces Remarn, a reconfigurable multi-threaded multi-core accelerator supporting both spatial and temporal co-execution of Recurrent Neural Network (RNN) inferences. It increases processing capabilities and quality of service of cloud-based \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Reconfigurable Technol. 
Syst.", articleno = "4", fjournal = "ACM Transactions on Reconfigurable Technology and Systems (TRETS)", journal-URL = "https://dl.acm.org/loi/trets", } @Article{Wang:2023:ADR, author = "Weidong Wang and Dian Li and Wangda Luo and Yujian Kang and Liqiang Wang", title = "Anthropomorphic diagnosis of runtime hidden behaviors in {OpenMP} multi-threaded applications", journal = j-J-PAR-DIST-COMP, volume = "177", number = "??", pages = "17--27", month = jul, year = "2023", CODEN = "JPDCER", DOI = "https://doi.org/10.1016/j.jpdc.2023.02.012", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Tue Apr 18 07:25:10 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/jpardistcomp2020.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pvm.bib", URL = "http://www.sciencedirect.com/science/article/pii/S0743731523000333", acknowledgement = ack-nhfb, fjournal = "Journal of Parallel and Distributed Computing", journal-URL = "http://www.sciencedirect.com/science/journal/07437315", } @Article{Al-sudani:2024:MBA, author = "Ahlam Hanoon Al-sudani and Basheera M. Mahmmod and Firas A. Sabir and Sadiq H. 
Abdulhussain and Muntadher Alsabah and Wameedh Nazar Flayyih", title = "Multithreading-Based Algorithm for High-Performance {Tchebichef} Polynomials with Higher Orders", journal = j-ALGORITHMS-BASEL, volume = "17", number = "9", month = sep, year = "2024", CODEN = "ALGOCH", DOI = "https://doi.org/10.3390/a17090381", ISSN = "1999-4893 (electronic)", ISSN-L = "1999-4893", bibdate = "Mon Sep 30 08:06:04 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/algorithms.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "https://www.mdpi.com/1999-4893/17/9/381", acknowledgement = ack-nhfb, articleno = "381", fjournal = "Algorithms (Basel)", journal-URL = "https://www.mdpi.com/journal/algorithms", pagecount = "??", } @Article{Hsu:2024:SHM, author = "Kuan-Chieh Hsu and Hung-Wei Tseng", title = "Simultaneous and Heterogeneous Multithreading: Exploiting Simultaneous and Heterogeneous Parallelism in Accelerator-Rich Architectures", journal = j-IEEE-MICRO, volume = "44", number = "4", pages = "11--19", month = jul # "\slash " # aug, year = "2024", CODEN = "IEMIDZ", DOI = "https://doi.org/10.1109/MM.2024.3414941", ISSN = "0272-1732 (print), 1937-4143 (electronic)", ISSN-L = "0272-1732", bibdate = "Sat Aug 24 10:43:43 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeemicro.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "IEEE Micro", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=40", keywords = "Codes; Computational modeling; Graphics processing units; Hardware acceleration; Instruction sets; Kernel; Runtime", } @Article{Langr:2024:PMD, author = "Daniel Langr and Tom{\'a}s Dytrych", title = "Parallel multithreaded deduplication of data sequences in nuclear structure calculations", journal = j-IJHPCA, volume = "38", number = "1", pages = "5--16", day = "1", month = jan, year = "2024", CODEN = "IHPCFL", DOI = "https://doi.org/10.1177/10943420231183697", ISSN = 
"1094-3420 (print), 1741-2846 (electronic)", ISSN-L = "1094-3420", bibdate = "Thu May 30 07:31:50 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/ijsa.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "https://journals.sagepub.com/doi/abs/10.1177/10943420231183697", acknowledgement = ack-nhfb, ajournal = "Int. J. High Perform. Comput. Appl.", fjournal = "International Journal of High Performance Computing Applications", journal-URL = "https://journals.sagepub.com/home/hpc", ORCID-numbers = "https://orcid.org/0000-0001-9760-7068", } @Article{Li:2024:SSQ, author = "Peixuan Li and Ping Xie and Qiang Cao", title = "{SSRAID}: a Stripe-Queued and Stripe-Threaded Merging {I/O} Strategy to Improve Write Performance of Serial Interface {SSD} {RAID}", journal = j-IEEE-TRANS-PAR-DIST-SYS, volume = "35", number = "10", pages = "1841--1853", month = oct, year = "2024", CODEN = "ITDSEO", DOI = "https://doi.org/10.1109/TPDS.2024.3443083", ISSN = "1045-9219 (print), 1558-2183 (electronic)", ISSN-L = "1045-9219", bibdate = "Wed Sep 25 15:15:27 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeetranspardistsys2020.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, ajournal = "IEEE Trans. Parallel Distrib. Syst.", fjournal = "IEEE Transactions on Parallel and Distributed Systems", journal-URL = "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=71", keywords = "Instruction sets; Linux; Merging; multi-thread; Performance evaluation; redundant array of independent disks; request scheduling management; Software; Solid state drive; stage buffer; Throughput; Writing", } @Article{Liu:2024:PSS, author = "Changxi Liu and Alen Sabu and Akanksha Chaudhari and Qingxuan Kang and Trevor E. 
Carlson", title = "{Pac-Sim}: Simulation of Multi-threaded Workloads using Intelligent, Live Sampling", journal = j-TACO, volume = "21", number = "4", pages = "81:1--81:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3680548", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Nov 22 08:21:31 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3680548", abstract = "High-performance, multi-core processors are the key to accelerating workloads in several application domains. To continue to scale performance at the limit of Moore's Law and Dennard scaling, software and hardware designers have turned to dynamic \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "81", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Watanabe:2024:CDP, author = "Yutaka Watanabe and Miwako Tsuji and Hitoshi Murai and Taisuke Boku and Mitsuhisa Sato", title = "Correction: Design and performance evaluation of {UCX} for the {Tofu Interconnect D} on {Fugaku} towards efficient multithreaded communication", journal = j-J-SUPERCOMPUTING, volume = "80", number = "17", pages = "25710--25710", month = nov, year = "2024", CODEN = "JOSUED", DOI = "https://doi.org/10.1007/s11227-024-06421-1", ISSN = "0920-8542 (print), 1573-0484 (electronic)", ISSN-L = "0920-8542", bibdate = "Fri Sep 6 07:04:56 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/jsuper2020.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "See \cite{Watanabe:2024:DPE}.", URL = "https://link.springer.com/article/10.1007/s11227-024-06421-1", acknowledgement = ack-nhfb, ajournal = "J. 
Supercomputing", fjournal = "The Journal of Supercomputing", journal-URL = "http://link.springer.com/journal/11227", } @Article{Watanabe:2024:DPE, author = "Yutaka Watanabe and Miwako Tsuji and Hitoshi Murai and Taisuke Boku and Mitsuhisa Sato", title = "Design and performance evaluation of {UCX} for the {Tofu Interconnect D} on {Fugaku} towards efficient multithreaded communication", journal = j-J-SUPERCOMPUTING, volume = "80", number = "14", pages = "20715--20742", month = sep, year = "2024", CODEN = "JOSUED", DOI = "https://doi.org/10.1007/s11227-024-06201-x", ISSN = "0920-8542 (print), 1573-0484 (electronic)", ISSN-L = "0920-8542", bibdate = "Sat Aug 3 09:46:55 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/jsuper2020.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "See correction \cite{Watanabe:2024:CDP}.", URL = "https://link.springer.com/article/10.1007/s11227-024-06201-x", acknowledgement = ack-nhfb, ajournal = "J. Supercomputing", fjournal = "The Journal of Supercomputing", journal-URL = "http://link.springer.com/journal/11227", } @Article{Lu:2025:TMT, author = "Jiayin Lu and Chris H. Rycroft", title = "\pkg{TriMe++} : Multi-threaded triangular meshing in two dimensions", journal = j-COMP-PHYS-COMM, volume = "308", number = "??", pages = "??--??", month = mar, year = "2025", CODEN = "CPHCBZ", DOI = "https://doi.org/10.1016/j.cpc.2024.109442", ISSN = "0010-4655 (print), 1879-2944 (electronic)", ISSN-L = "0010-4655", bibdate = "Tue Dec 31 12:14:09 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/compphyscomm2020.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.sciencedirect.com/science/article/pii/S0010465524003655", acknowledgement = ack-nhfb, articleno = "109442", fjournal = "Computer Physics Communications", journal-URL = "http://www.sciencedirect.com/science/journal/00104655", } @Article{Magalhaes:2025:IVP, author = "Tiago E. C. 
Magalh{\~a}es", title = "An improved version of {PyWolf} with multithread-based parallelism support", journal = j-COMP-PHYS-COMM, volume = "307", number = "??", pages = "??--??", month = feb, year = "2025", CODEN = "CPHCBZ", DOI = "https://doi.org/10.1016/j.cpc.2024.109431", ISSN = "0010-4655 (print), 1879-2944 (electronic)", ISSN-L = "0010-4655", bibdate = "Fri Dec 6 05:59:54 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/compphyscomm2020.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/python.bib", URL = "http://www.sciencedirect.com/science/article/pii/S0010465524003540", acknowledgement = ack-nhfb, articleno = "109431", fjournal = "Computer Physics Communications", journal-URL = "http://www.sciencedirect.com/science/journal/00104655", } %%% ==================================================================== %%% Cross-referenced entries must come last: @Proceedings{IEEE:1989:WOS, editor = "{IEEE}", booktitle = "Workstation Operating Systems: Proceedings of the Second Workshop on Workstation Operating Systems (WWOS-II), Pacific Grove, CA, USA, September 27--29, 1989", title = "Workstation Operating Systems: Proceedings of the Second Workshop on Workstation Operating Systems ({WWOS}-{II}), Pacific Grove, {CA}, {USA}, September 27--29, 1989", publisher = pub-IEEE, address = pub-IEEE:adr, pages = "xi + 134", year = "1989", bibdate = "Sat Sep 28 20:21:01 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/mach.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "IEEE catalog number 89TH0281-6.", acknowledgement = ack-nhfb, classification = "B0100 (General electrical engineering topics); B6210L (Computer communications); C5430 (Microcomputers); C5620 (Computer networks and techniques); C5630 (Networking equipment); C6120 (File organisation); C6150J (Operating systems); C6155 (Computer communications software)", confsponsor = "IEEE", keywords = "AIX3; At-most-once message; 
Coda file system; Echo distributed file system; Fault-tolerant multiprocessor workstations; File implementation; File-server statelessness; Global communication interface; Guide operating system; Large-scale applications; Mach; Multimedia applications; Object-oriented environments; Open operating system; Parallel algorithms; PLURIX; PROST; Prototype information environment; Raven project; Replicated servers; Shared memory; Sprite; Synchronized clocks; Ubik database; Very large distributed systems; Virtual memory; Virtual systems; Workstation networks; Workstation-network communication interface; X-kernel", thesaurus = "Computer communications software; Computer networks; File organisation; File servers; Operating systems [computers]; Workstations", } @Proceedings{USENIX:1989:PWU, editor = "{USENIX Association}", booktitle = "Proceedings of the Winter 1989 {USENIX} Conference: January 30--February 3, 1989, San Diego, California, {USA}", title = "Proceedings of the Winter 1989 {USENIX} Conference: January 30--February 3, 1989, San Diego, California, {USA}", publisher = pub-USENIX, address = pub-USENIX:adr, pages = "x + 471", year = "1989", bibdate = "Sun Feb 18 07:46:09 MST 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "UNIX (Computer operating system) --- Congresses.", } @Proceedings{Anonymous:1990:PWU, editor = "Anonymous", booktitle = "Proceedings of the Winter 1990 USENIX Conference, Washington, DC, USA, January 22--26, 1990", title = "Proceedings of the Winter 1990 {USENIX} Conference, Washington, {DC}, {USA}, January 22--26, 1990", publisher = pub-USENIX, address = pub-USENIX:adr, pages = "xvi + 374", year = "1990", bibdate = "Sat Sep 28 20:03:34 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Proceedings{IEEE:1990:PSN, editor = "{IEEE}", booktitle = "Proceedings, Supercomputing '90: November 12--16, 1990, New York Hilton at 
Rockefeller Center, New York, New York", title = "Proceedings, Supercomputing '90: November 12--16, 1990, New York Hilton at Rockefeller Center, New York, New York", publisher = pub-IEEE, address = pub-IEEE:adr, pages = "xxv + 982", year = "1990", ISBN = "0-8186-2056-0 (paperback: IEEE Computer Society), 0-89791-412-0 (paperback: ACM)", ISBN-13 = "978-0-8186-2056-0 (paperback: IEEE Computer Society), 978-0-89791-412-3 (paperback: ACM)", LCCN = "QA 76.88 S87 1990", bibdate = "Wed Aug 28 06:48:31 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; University of California MELVYL catalog.", note = "ACM order number 415903. IEEE Computer Society Press order number 2056. IEEE catalog number 90CH2916-5.", acknowledgement = ack-nhfb, classification = "C5440 (Multiprocessor systems and techniques); C5470 (Performance evaluation and testing); C6110 (Systems analysis and programming); C7000 (Computer applications)", keywords = "biological applications; computer applications; computer chess; innovative architectures; linear algebra algorithms; memory; networking computing; parallel languages; parallel processing; particle transport; partitioning; performance evaluation; performance visualizations; pipeline processing; program analysis; program restructuring; scheduling; supercomputers --- congresses; vector algorithms", } @Proceedings{Anonymous:1991:PIS, editor = "Anonymous", booktitle = "{Proceedings of the International Symposium on Supercomputing: Fukuoka, Japan, November 6--8, 1991}", title = "{Proceedings of the International Symposium on Supercomputing: Fukuoka, Japan, November 6--8, 1991}", publisher = "Kyushu University Press", address = "Fukuoka, Japan", pages = "iv + 261", year = "1991", ISBN = "4-87378-284-8", ISBN-13 = "978-4-87378-284-3", LCCN = "QA76.88.I1991", bibdate = "Fri Aug 30 08:01:51 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Supercomputers --- 
Congresses", } @Proceedings{USENIX:1991:PUM, editor = "{USENIX}", booktitle = "Proceedings of the {USENIX} Mach Symposium: November 20--22, 1991, Monterey, California, USA", title = "Proceedings of the {USENIX} Mach Symposium: November 20--22, 1991, Monterey, California, {USA}", publisher = pub-USENIX, address = pub-USENIX:adr, pages = "262", year = "1991", LCCN = "QAX 27", bibdate = "Sun Feb 18 07:46:09 MST 1996", bibsource = "ftp://ftp.uu.net/library/bibliography; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Memory management (Computer science) --- Congresses; Operating systems (Computers) --- Congresses; UNIX (Computer file) --- Congresses", } @Proceedings{USENIX:1991:PWU, editor = "{USENIX}", key = "USENIX-WINTER'91", booktitle = "Proceedings of the Winter 1991 {USENIX} Conference: January 21--January 25, 1991, Dallas, {TX}, {USA}", title = "Proceedings of the Winter 1991 {USENIX} Conference: January 21--January 25, 1991, Dallas, {TX}, {USA}", publisher = pub-USENIX, address = pub-USENIX:adr, pages = "ix + 363", year = "1991", LCCN = "QA 76.76 O63 U84 1992", bibdate = "Mon Jul 18 12:14:50 1994", bibsource = "ftp://ftp.uu.net/library/bibliography; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Computer networks --- Congresses; Operating systems (Computers) --- Congresses; Programming (Electronic computers) --- Congresses; UNIX (Computer file) --- Congresses", } @Proceedings{Watt:1991:IPI, editor = "Stephen M. 
Watt", booktitle = "ISSAC '91: proceedings of the 1991 International Symposium on Symbolic and Algebraic Computation, July 15--17, 1991, Bonn, Germany", title = "{ISSAC} '91: proceedings of the 1991 International Symposium on Symbolic and Algebraic Computation, July 15--17, 1991, Bonn, Germany", publisher = pub-ACM, address = pub-ACM:adr, pages = "xiii + 468", year = "1991", ISBN = "0-89791-437-6", ISBN-13 = "978-0-89791-437-6", LCCN = "QA 76.95 I59 1991", bibdate = "Thu Sep 26 06:00:06 MDT 1996", bibsource = "https://www.math.utah.edu/pub/bibnet/authors/d/dirac-p-a-m.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The following topics were dealt with: algorithms for symbolic mathematical computation; languages, systems and packages; computational geometry, group theory and number theory; automatic theorem proving and programming; interface of symbolics, numerics and graphics; applications in mathematics, science and engineering; and symbolic and algebraic computation in education.", acknowledgement = ack-nhfb, classification = "C1160 (Combinatorial mathematics); C4130 (Interpolation and function approximation); C4210 (Formal logic); C4240 (Programming and algorithm theory); C7310 (Mathematics)", confdate = "15--17 July 1991", conflocation = "Bonn, Germany", confsponsor = "ACM", keywords = "algebra --- data processing --- congresses; Algebraic computation; Algorithms; Automatic theorem proving; Computational geometry; Education; Engineering; Graphics; Group theory; Languages; Mathematics; mathematics --- data processing --- congresses; Number theory; Programming; Science; Symbolic mathematical computation; Symbolics", pubcountry = "USA", thesaurus = "Computational complexity; Formal languages; Interpolation; Number theory; Polynomials; Symbol manipulation", } @Proceedings{ACM:1992:CPI, editor = "{ACM}", booktitle = "Conference proceedings / 1992 International Conference on Supercomputing, July 19--23, 1992, Washington, DC", title = 
"Conference proceedings / 1992 International Conference on Supercomputing, July 19--23, 1992, Washington, {DC}", publisher = pub-ACM, address = pub-ACM:adr, pages = "x + 485", year = "1992", ISBN = "0-89791-485-6 (paperback), 0-89791-486-4", ISBN-13 = "978-0-89791-485-7 (paperback), 978-0-89791-486-4", LCCN = "QA 76.88 I57 1992", bibdate = "Wed Aug 28 06:48:31 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; University of California MELVYL catalog.", note = "Sponsored by ACM SIGARCH.", acknowledgement = ack-nhfb, keywords = "supercomputers --- congresses", } @Proceedings{IEEE:1992:PSM, editor = "{IEEE Computer Society. Technical Committee on Computer Architecture}", booktitle = "Proceedings, Supercomputing '92: Minneapolis, Minnesota, November 16-20, 1992", title = "Proceedings, Supercomputing '92: Minneapolis, Minnesota, November 16-20, 1992", publisher = pub-IEEE, address = pub-IEEE:adr, pages = "xxiv + 848", year = "1992", ISBN = "0-8186-2632-1 (case), 0-8186-2630-5 (paper), 0-8186-2631-3 (microfiche), 0-89791-537-2 (ACM Library series)", ISBN-13 = "978-0-8186-2632-6 (case), 978-0-8186-2630-2 (paper), 978-0-8186-2631-9 (microfiche), 978-0-89791-537-3 (ACM Library series)", LCCN = "QA76.5 .S894 1992", bibdate = "Wed Aug 28 06:48:31 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; University of California MELVYL catalog.", note = "Cover title: Supercomputing '91. ACM order number 415922. 
IEEE Computer Society Press order number 2630. IEEE catalog number 92CH3216-9.", acknowledgement = ack-nhfb, keywords = "artificial intelligence; biosciences; cache; compiling; distributed computing; fluids; industrial modeling; instruction-level optimization; interconnections; massively parallel systems; multiprocessing programs; multiprocessing systems; numerical applications; parallel algorithms; parallel programming; parallelizing transformations; particles; performance evaluation; performance methodology; register efficiency; scheduling; sparse matrix algorithms; supercomputers --- congresses; symbolic algorithms; waves", } @Proceedings{USENIX:1992:PSU, editor = "{USENIX}", booktitle = "Proceedings of the Summer 1992 {USENIX} Conference: June 8--12, 1992, San Antonio, Texas, USA", title = "Proceedings of the Summer 1992 {USENIX} Conference: June 8--12, 1992, San Antonio, Texas, {USA}", publisher = pub-USENIX, address = pub-USENIX:adr, pages = "vii + 253", month = "Summer", year = "1992", ISBN = "1-880446-44-8", ISBN-13 = "978-1-880446-44-7", LCCN = "QA 76.76 O63 U83 1992", bibdate = "Wed Aug 13 10:48:45 MDT 1997", bibsource = "ftp://ftp.uu.net/library/bibliography; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, annote = "Spine title: San Antonio conference proceedings.", keywords = "UNIX (Computer operating system) --- Congresses", location = "San Antonio, TX", } @Proceedings{USENIX:1992:SED, editor = "{USENIX}", booktitle = "Symposium on Experiences with Distributed and Multiprocessor Systems (SEDMS III), March 26--27, 1992. Newport Beach, CA", title = "Symposium on Experiences with Distributed and Multiprocessor Systems ({SEDMS III}), March 26--27, 1992. 
Newport Beach, {CA}", publisher = pub-USENIX, address = pub-USENIX:adr, pages = "326", day = "26--27", month = mar, year = "1992", ISBN = "1-880446-41-3", ISBN-13 = "978-1-880446-41-6", LCCN = "QA76.9.D3 S954 1992", bibdate = "Wed Oct 16 13:53:39 2002", bibsource = "ftp://ftp.uu.net/library/bibliography; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, location = "Newport Beach, CA", } @Proceedings{ACM:1993:CRT, editor = "{ACM}", key = "ACM SIGPLAN POPL '93", booktitle = "Conference record of the Twentieth Annual {ACM} {SIGPLAN-SIGACT} Symposium on Principles of Programming Languages: papers presented at the symposium, {Charleston, South Carolina}, {January} 10--13, 1993", title = "Conference record of the Twentieth Annual {ACM} {SIGPLAN-SIGACT} Symposium on Principles of Programming Languages: papers presented at the symposium, {Charleston, South Carolina}, {January} 10--13, 1993", publisher = pub-ACM, address = pub-ACM:adr, pages = "viii + 510", year = "1993", ISBN = "0-89791-560-7 (soft cover), 0-89791-561-5 (series hard cover)", ISBN-13 = "978-0-89791-560-1 (soft cover), 978-0-89791-561-8 (series hard cover)", LCCN = "QA76.7 .A15 1993", bibdate = "Mon May 03 18:38:48 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "ACM order number 549930.", URL = "http://www.acm.org/pubs/contents/proceedings/plan/158511/index.html", acknowledgement = ack-nhfb, classification = "C4210 (Formal logic); C4240 (Programming and algorithm theory); C6110 (Systems analysis and programming); C6140D (High level languages); C6150C (Compilers, interpreters and other processors); C6170 (Expert systems)", confdate = "10-13 Jan. 
1993", conflocation = "Charleston, SC, USA", confsponsor = "ACM", keywords = "Compilers; Computational complexity; electronic digital computers --- programming --- congresses; Functional programming; Lambda calculus; Lazy evaluation; Logic programming; Object-oriented languages; Parallel computing; Parametricity; Polymorphism; Program testing/debugging; Programming language principles; programming languages (electronic computers) --- congresses; Register allocation; Typed languages", thesaurus = "Computational complexity; High level languages; Lambda calculus; Program compilers; Programming; Programming theory; Storage allocation", } @Proceedings{ACM:1993:PTF, editor = "{ACM}", booktitle = "{Proceedings of the twenty-fifth annual ACM Symposium on the Theory of Computing, San Diego, California, May 16--18, 1993}", title = "{Proceedings of the twenty-fifth annual ACM Symposium on the Theory of Computing, San Diego, California, May 16--18, 1993}", publisher = pub-ACM, address = pub-ACM:adr, pages = "ix + 812", year = "1993", ISBN = "0-89791-591-7", ISBN-13 = "978-0-89791-591-5", LCCN = "QA 76.6 A13 1993", bibdate = "Thu Dec 3 07:11:18 MST 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "ACM order no. 508930.", acknowledgement = ack-nhfb, keywords = "computational complexity --- congresses", } @Proceedings{ACM:1993:TCS, editor = "ACM", booktitle = "TRI-Ada '93: Conference --- September 1993, Seattle, WA", title = "{TRI}-Ada '93: Conference --- September 1993, Seattle, {WA}", publisher = pub-ACM, address = pub-ACM:adr, pages = "vii + 482", year = "1993", ISBN = "0-89791-621-2", ISBN-13 = "978-0-89791-621-9", LCCN = "????", bibdate = "Thu Sep 04 12:56:10 1997", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "ACM Order No. 
825930.", series = "TRIADA -proceedings- 1993", acknowledgement = ack-nhfb, sponsor = "Association for Computing Machinery; SIGAda.", } @Proceedings{IEEE:1993:PSP, editor = "{IEEE}", key = "Supercomputing'93", booktitle = "Proceedings, Supercomputing '93: Portland, Oregon, November 15--19, 1993", title = "Proceedings, Supercomputing '93: Portland, Oregon, November 15--19, 1993", publisher = pub-IEEE, address = pub-IEEE:adr, pages = "xxii + 935", year = "1993", ISBN = "0-8186-4340-4 (paperback), 0-8186-4341-2 (microfiche), 0-8186-4342-0 (hardback), 0-8186-4346-3 (CD-ROM)", ISBN-13 = "978-0-8186-4340-8 (paperback), 978-0-8186-4341-5 (microfiche), 978-0-8186-4342-2 (hardback), 978-0-8186-4346-0 (CD-ROM)", ISSN = "1063-9535", LCCN = "QA76.5 .S96 1993", bibdate = "Mon Jan 15 11:06:21 1996", bibsource = "https://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classification = "631.1; 722.1; 722.3; 722.4; 723.2; 921.6", keywords = "Algorithms; Cache coherence; Clustered workstations; Computer graphics; Computer networks; Computer programming languages; Data parallel compilers; Data partitioning; Distributed computer systems; Eigenvalues and eigenfunctions; Finite element method; Flow visualization; Fluid mechanics; Linear algebra; Mass storage; Massively parallel processors; Natural sciences computing; Parallel languages; Parallel processing systems; Parallel rendering; Program compilers; Quantum theory; Scheduling; Sparse matrices; Supercomputers", sponsor = "Institute of Electrical and Electronics Engineers; Computer Society. 
Association for Computing Machinery; SIGARCH.", } @Proceedings{USENIX:1993:PUMb, editor = "{USENIX}", booktitle = "Proceedings of the {USENIX} Mobile and Location-Independent Computing Symposium: August 2--3, 1993, Cambridge, Massachusetts, USA", title = "Proceedings of the {USENIX} Mobile and Location-Independent Computing Symposium: August 2--3, 1993, Cambridge, Massachusetts, {USA}", publisher = pub-USENIX, address = pub-USENIX:adr, pages = "138", year = "1993", ISBN = "1-880446-51-0", ISBN-13 = "978-1-880446-51-5", LCCN = "QA 76.76 O63 U86 1993", bibdate = "Tue Oct 22 08:33:21 2002", bibsource = "ftp://ftp.uu.net/library/bibliography; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.usenix.org/publications/library/proceedings/mobile93/", acknowledgement = ack-nhfb, annote = "Spine title: Mobile and Location-Independent Computing Symposium, Summer 1993.", keywords = "Computer networks --- Congresses; Portable computers --- Communication systems --- Congresses; UNIX (Computer file) --- Congresses", } @Proceedings{USENIX:1993:PWU, editor = "{USENIX}", booktitle = "Proceedings of the Winter 1993 {USENIX} Conference: January 25--29, 1993, San Diego, California, {USA}", title = "Proceedings of the Winter 1993 {USENIX} Conference: January 25--29, 1993, San Diego, California, {USA}", publisher = pub-USENIX, address = pub-USENIX:adr, pages = "x + 530", year = "1993", ISBN = "1-880446-48-0", ISBN-13 = "978-1-880446-48-5", LCCN = "QA 76.76 O63 U84 1993", bibdate = "Sun Feb 18 07:46:09 MST 1996", bibsource = "ftp://ftp.uu.net/library/bibliography; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.usenix.org/publications/library/proceedings/sd93/", acknowledgement = ack-nhfb, annote = "Spine title: USENIX San Diego conference proceedings, winter 1993. 
Running title: 1993 winter USENIX, January 25--29, 1993, San Diego, CA.", keywords = "Computer networks --- Congresses; Operating systems (Computers) --- Congresses; Programming (Electronic computers) --- Congresses; UNIX (Computer file) --- Congresses", } @Proceedings{ACM:1994:ASC, editor = "{ACM}", booktitle = "{ACM SIGPLAN '94 Conference on Programming Language Design and Implementation (PLDI). Orlando, FL, USA, 20--24 June, 1994}", title = "{ACM SIGPLAN '94 Conference on Programming Language Design and Implementation (PLDI). Orlando, FL, USA, 20--24 June, 1994}", volume = "29(6)", publisher = pub-ACM, address = pub-ACM:adr, pages = "360", month = jun, year = "1994", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Apr 24 18:36:02 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", series = j-SIGPLAN, acknowledgement = ack-nhfb, classification = "C4240 (Programming and algorithm theory); C6110 (Systems analysis and programming); C6140D (High level languages); C6150C (Compilers, interpreters and other processors); C6150G (Diagnostic, testing, debugging and evaluating systems)", conftitle = "ACM SIGPLAN '94 Conference on Programming Language Design and Implementation (PLDI)", keywords = "address calculation; array access errors; backtracking; cache performance; CLP; code replication; compilation techniques; continuation passing; garbage collected programs; high level languages; jump debugging; jump statements; lazy functional state threads; link-time optimisation; memory access coalescing; optimal tracing; optimisation; partial dead code elimination; pointer-based data structures; Presburger Formulas; program analysis tools; program compilers; program debugging; program optimisation; program structure tree; programming; programming language design; programming theory; programming theory program debugging; Prolog; register allocation; slicing programs; 
Standard ML; type analysis; zero-cost range splitting", sponsororg = "ACM", treatment = "P Practical; T Theoretical or Mathematical", } @Proceedings{ACM:1994:CRP, editor = "{ACM}", booktitle = "Conference record of {POPL} '94, 21st {ACM SIGPLAN-SIGACT} Symposium on Principles of Programming Languages: papers presented at the Symposium: Portland, Oregon, January 17--21, 1994", title = "Conference record of {POPL} '94, 21st {ACM SIGPLAN-SIGACT} Symposium on Principles of Programming Languages: papers presented at the Symposium: Portland, Oregon, January 17--21, 1994", publisher = pub-ACM, address = pub-ACM:adr, pages = "viii + 492", year = "1994", ISBN = "0-89791-636-0", ISBN-13 = "978-0-89791-636-3", LCCN = "QA76.7 .A15 1994", bibdate = "Sat Sep 7 07:51:54 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org/pubs/contents/proceedings/plan/174675/index.html", abstract = "The following topics were dealt with: programming language principles; OOP; type theory; program correctness; lambda calculus; garbage collection; logic programming; scheduling; data flow graphs; functional programming; and continuation passing.", acknowledgement = ack-nhfb, classification = "C4210 (Formal logic); C4240 (Programming and algorithm theory); C6110J (Object-oriented programming); C6120 (File organisation); C6140D (High level languages); C6150C (Compilers, interpreters and other processors)", confdate = "17--21 Jan. 
1994", conflocation = "Portland, OR, USA", confsponsor = "ACM", keywords = "Continuation passing; Data flow graphs; Functional programming; Garbage collection; Lambda calculus; Logic programming; OOP; Program correctness; Programming language principles; Scheduling; Type theory", thesaurus = "High level languages; Lambda calculus; Object-oriented programming; Program compilers; Program verification; Storage management; Type theory", } @Proceedings{ACM:1994:IPI, editor = "{ACM}", booktitle = "{ISSAC '94: Proceedings of the 1994 International Symposium on Symbolic and Algebraic Computation: July 20--22, 1994, Oxford, England, United Kingdom}", title = "{ISSAC '94: Proceedings of the 1994 International Symposium on Symbolic and Algebraic Computation: July 20--22, 1994, Oxford, England, United Kingdom}", publisher = pub-ACM, address = pub-ACM:adr, pages = "ix + 359", year = "1994", ISBN = "0-89791-638-7", ISBN-13 = "978-0-89791-638-7", LCCN = "QA76.95.I59 1994", bibdate = "Thu Sep 26 05:45:15 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, confdate = "20--22 July 1994", conflocation = "Oxford, UK", confsponsor = "ACM", pubcountry = "USA", } @Proceedings{ACM:1994:SIC, editor = "ACM", booktitle = "{Sixth International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS-VI). San Jose, CA, USA, 4--7 October, 1994}", title = "{Sixth International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS-VI). 
San Jose, CA, USA, 4--7 October, 1994}", volume = "29(11)", publisher = pub-ACM, address = pub-ACM:adr, pages = "328", month = nov, year = "1994", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Apr 24 18:36:02 MDT 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", series = j-SIGPLAN, acknowledgement = ack-nhfb, classification = "C5220 (Computer architecture); C6140 (Programming languages); C6150J (Operating systems)", conflocation = "San Jose, CA, USA", conftitle = "Sixth International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS-VI)", keywords = "architectural support; code transformation; computer architecture; instrumentation; measurement; memory access; multithreading; operating systems; operating systems (computers); parallel machines; programming languages; shared memory multiprocessors; uniprocessor performance", sponsororg = "ACM; IEEE Comput. Soc", } @Proceedings{Anonymous:1994:ICS, editor = "Anonymous", booktitle = "1994 International Computer Symposium Conference Proceedings", title = "1994 International Computer Symposium Conference Proceedings", publisher = "Nat. Chiao Tung Univ", address = "Hsinchu, Taiwan", pages = "xvi + 1310", year = "1994", ISBN = "????", ISBN-13 = "????", LCCN = "????", bibdate = "Sun Dec 22 10:19:23 MST 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "2 vol.", acknowledgement = ack-nhfb, confdate = "12--15 Dec. 1994", conflocation = "Hsinchu, Taiwan", confsponsor = "Ministr. Educ.; Comput.
Soc", pubcountry = "Taiwan", } @Proceedings{Anonymous:1994:PIW, editor = "Anonymous", booktitle = "Proceedings of the 2nd International World Wide Web conference, Mosaic and the Web, October 1994, Ramada-Congress Hotel, 520 South Michigan Avenue, Chicago, IL", title = "Proceedings of the 2nd International World Wide Web conference, Mosaic and the Web, October 1994, Ramada-Congress Hotel, 520 South Michigan Avenue, Chicago, {IL}", volume = "18(6)", publisher = pub-LEARNED-INF, address = pub-LEARNED-INF:adr, pages = "????", year = "1994", CODEN = "ONCDEW", ISSN = "0309-314X", bibdate = "Sun Oct 22 08:43:14 2000", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", series = j-ONLINE-CDROM-REV, URL = "http://www.ncsa.uiuc.edu/SDG/IT94/Proceedings/WWW2_Proceedings.html", acknowledgement = ack-nhfb, } @Proceedings{Anonymous:1994:USC, editor = "Anonymous", booktitle = "USENIX Summer conference: --- June 1994, Boston, MA", title = "{USENIX} Summer conference: -- June 1994, Boston, {MA}", publisher = pub-USENIX, address = pub-USENIX:adr, pages = "316", year = "1994", ISBN = "1-880446-62-6", ISBN-13 = "978-1-880446-62-1", LCCN = "QA 76.76 O63 U83 1994", bibdate = "Sat May 25 07:59:58 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", series = "USENIX Conference Proceedings 1994", acknowledgement = ack-nhfb, } @Proceedings{Goldwasser:1994:PAS, editor = "Shafi Goldwasser", booktitle = "Proceedings: 35th Annual Symposium on Foundations of Computer Science, November 20--22, 1994, Santa Fe, New Mexico", title = "Proceedings: 35th Annual Symposium on Foundations of Computer Science, November 20--22, 1994, Santa Fe, New Mexico", publisher = pub-IEEE, address = pub-IEEE:adr, pages = "xiii + 837", year = "1994", CODEN = "ASFPDV", ISBN = "0-8186-6582-3", ISBN-13 = "978-0-8186-6582-0", ISSN = "0272-5428", LCCN = "QA 76 S979 1994", bibdate = "Thu Dec 3 07:11:18 MST 1998", bibsource = 
"https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "IEEE catalog number 94CH35717. IEEE Computer Society Press Order Number 6580-02.", acknowledgement = ack-nhfb, keywords = "electronic data processing --- congresses", } @Proceedings{Hong:1994:FIS, editor = "Hoon Hong", booktitle = "{First International Symposium on Parallel Symbolic Computation, PASCO '94, Hagenberg\slash Linz, Austria, September 26--28, 1994}", title = "{First International Symposium on Parallel Symbolic Computation, PASCO '94, Hagenberg\slash Linz, Austria, September 26--28, 1994}", volume = "5", publisher = pub-WORLD-SCI, address = pub-WORLD-SCI:adr, pages = "xiii + 431", year = "1994", ISBN = "981-02-2040-5", ISBN-13 = "978-981-02-2040-2", LCCN = "QA76.642.I58 1994", bibdate = "Thu Mar 12 07:55:38 MST 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/issac.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", series = "Lecture notes series in computing", acknowledgement = ack-nhfb, alttitle = "Parallel symbolic computation", keywords = "Parallel programming (Computer science) --- Congresses.", } @Proceedings{IEEE:1994:PIW, editor = "{IEEE}", booktitle = "Proceedings 11th IEEE Workshop on Real-Time Operating Systems and Software. RTOSS '94, Seattle, WA, USA, 18--19 May 1994", title = "Proceedings 11th {IEEE} Workshop on Real-Time Operating Systems and Software. 
{RTOSS} '94, Seattle, {WA}, {USA}, 18--19 May 1994", publisher = pub-IEEE, address = pub-IEEE:adr, pages = "viii + 117", year = "1994", ISBN = "0-8186-5710-3", ISBN-13 = "978-0-8186-5710-8", LCCN = "QA76.54.I173 1994", bibdate = "Sat Sep 28 18:52:45 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/mach.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "IEEE catalog number 94TH0639-5.", acknowledgement = ack-nhfb, confsponsor = "IEEE", } @Proceedings{IEEE:1994:PSH, editor = "{IEEE}", booktitle = "{Proceedings of the Scalable High-Performance Computing Conference, May 23--25, 1994, Knoxville, Tennessee}", title = "{Proceedings of the Scalable High-Performance Computing Conference, May 23--25, 1994, Knoxville, Tennessee}", publisher = pub-IEEE, address = pub-IEEE:adr, pages = "xviii + 852", year = "1994", ISBN = "0-8186-5680-8, 0-8186-5681-6", ISBN-13 = "978-0-8186-5680-4, 978-0-8186-5681-1", LCCN = "QA76.5 .S244 1994", bibdate = "Mon Aug 26 10:38:41 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "IEEE catalog number 94TH0637-9.", acknowledgement = ack-nhfb, sponsor = "IEEE Computer Society; Technical Committee on Supercomputing Applications.", } @Proceedings{IEEE:1994:PSW, editor = "{IEEE}", booktitle = "{Proceedings, Supercomputing '94: Washington, DC, November 14--18, 1994}", title = "{Proceedings, Supercomputing '94: Washington, DC, November 14--18, 1994}", publisher = pub-IEEE, address = pub-IEEE:adr, pages = "xvii + 823", year = "1994", ISBN = "0-8186-6605-6 (paper), 0-8186-6606-4 (microfiche), 0-8186-6607-2 (case)", ISBN-13 = "978-0-8186-6605-6 (paper), 978-0-8186-6606-3 (microfiche), 978-0-8186-6607-0 (case)", ISSN = "1063-9535", LCCN = "QA76.5 .S894 1994", bibdate = "Fri Aug 30 08:01:51 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "IEEE catalog number 94CH34819.", series = "Supercomputing", acknowledgement = ack-nhfb, keywords = 
"Supercomputers --- Congresses", sponsor = "IEEE.", } @Proceedings{IEEE:1994:ROS, editor = "IEEE", booktitle = "Real-time operating systems and software: RTOSS '94: 11th Workshop --- May 1994, Seattle, WA", title = "Real-time operating systems and software: {RTOSS} '94: 11th Workshop --- May 1994, Seattle, {WA}", publisher = pub-IEEE, address = pub-IEEE:adr, pages = "viii + 117", year = "1994", ISBN = "0-8186-5710-3", ISBN-13 = "978-0-8186-5710-8", LCCN = "QA76.54.I173 1994", bibdate = "Sat May 25 07:59:58 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", series = "IEEE Workshop on Real Time Operating Systems and Software 1994; 11th", acknowledgement = ack-nhfb, sponsor = "IEEE; Computer Society; Technical Committee on Real-Time Systems.", } @Proceedings{ACM:1995:CPI, editor = "ACM", booktitle = "Conference proceedings of the 1995 International Conference on Supercomputing, Barcelona, Spain, July 3--7, 1995", title = "Conference proceedings of the 1995 International Conference on Supercomputing, Barcelona, Spain, July 3--7, 1995", publisher = pub-ACM, address = pub-ACM:adr, pages = "xii + 448", year = "1995", ISBN = "0-89791-728-6", ISBN-13 = "978-0-89791-728-5", LCCN = "QA 76.88 I57 1995", bibdate = "Mon Dec 23 18:50:57 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", series = "Conference Proceedings of the International Conference on Supercomputing", acknowledgement = ack-nhfb, sponsor = "Association for Computing Machinery. 
Special Interest Group on Computer Architecture.", } @Proceedings{ACM:1995:CRP, editor = "{ACM}", booktitle = "Conference record of {POPL} '95, 22nd {ACM} {SIGPLAN-SIGACT} Symposium on Principles of Programming Languages: papers presented at the Symposium: San Francisco, California, January 22--25, 1995", title = "Conference record of {POPL} '95, 22nd {ACM} {SIGPLAN-SIGACT} Symposium on Principles of Programming Languages: papers presented at the Symposium: San Francisco, California, January 22--25, 1995", publisher = pub-ACM, address = pub-ACM:adr, pages = "vii + 408", year = "1995", ISBN = "0-89791-692-1", ISBN-13 = "978-0-89791-692-9", LCCN = "QA 76.7 A11 1995", bibdate = "Mon May 3 17:47:49 MDT 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "ACM order number: 549950.", URL = "http://www.acm.org/pubs/contents/proceedings/plan/199448/index.html", acknowledgement = ack-nhfb, alttitle = "Proceedings, 22nd ACM SIGPLAN-SIGACT Symposium on Principles of Programming Languages POPL '95", annote = "Sponsored by the Association for Computing Machinery, Special Interest Group on Algorithms and Computation Theory (SIGACT), Special Interest Group on Programming Languages (SIGPLAN).", keywords = "Programming languages (Electronic computers) -- Congresses.", } @Proceedings{Ferreira:1995:PAI, editor = "Afonso Ferreira and Jose Rolim", booktitle = "{Parallel algorithms for irregularly structured problems: second international workshop, IRREGULAR 95, Lyon, France, September, 4--6, 1995: proceedings}", title = "{Parallel algorithms for irregularly structured problems: second international workshop, IRREGULAR 95, Lyon, France, September, 4--6, 1995: proceedings}", publisher = pub-SV, address = pub-SV:adr, pages = "x + 409", year = "1995", ISBN = "3-540-60321-2", ISBN-13 = "978-3-540-60321-4", LCCN = "QA76.642.I59 1995", bibdate = "Sun Dec 22 10:19:23 MST 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; 
https://www.math.utah.edu/pub/tex/bib/pvm.bib", acknowledgement = ack-nhfb, confsponsor = "IFIP", pubcountry = "Germany", } @Proceedings{IEEE:1995:PCL, editor = "{IEEE Computer Society. Technical Committee on Computer Communications}", booktitle = "Proceedings: 20th Conference on Local Computer Networks, October 16--19, 1995, Minneapolis, Minnesota", title = "Proceedings: 20th Conference on Local Computer Networks, October 16--19, 1995, Minneapolis, Minnesota", publisher = pub-IEEE, address = pub-IEEE:adr, pages = "xii + 496", year = "1995", ISBN = "0-8186-7163-7 (microfiche), 0-8186-7162-9", ISBN-13 = "978-0-8186-7163-0 (microfiche), 978-0-8186-7162-3", LCCN = "TK5105.7 .C66 1995 Bar", bibdate = "Mon Sep 27 06:55:07 MDT 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "IEEE Computer Society Press order number PR07162. IEEE catalog number 95TB100005", acknowledgement = ack-nhfb, keywords = "local area networks (computer networks) -- congresses", } @Proceedings{ACM:1996:FCP, editor = "{ACM}", booktitle = "FCRC '96: Conference proceedings of the 1996 International Conference on Supercomputing: Philadelphia, Pennsylvania, {USA}, May 25--28, 1996", title = "{FCRC} '96: Conference proceedings of the 1996 International Conference on Supercomputing: Philadelphia, Pennsylvania, {USA}, May 25--28, 1996", publisher = pub-ACM, address = pub-ACM:adr, pages = "xii + 406", year = "1996", ISBN = "0-89791-803-7", ISBN-13 = "978-0-89791-803-9", LCCN = "QA76.5 I61 1996", bibdate = "Wed Mar 18 12:33:29 MST 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "ACM order number 415961.", acknowledgement = ack-nhfb, keywords = "Supercomputers --- Congresses.", } @Proceedings{IEEE:1996:PSM, editor = "{IEEE}", booktitle = "Proceedings. Second MPI Developer's Conference: Notre Dame, IN, USA, 1--2 July 1996", title = "Proceedings. 
Second {MPI} Developer's Conference: Notre Dame, {IN}, {USA}, 1--2 July 1996", publisher = pub-IEEE, address = pub-IEEE:adr, pages = "ix + 207", year = "1996", ISBN = "0-8186-7533-0", ISBN-13 = "978-0-8186-7533-1", LCCN = "QA76.642 .M67 1996", bibdate = "Tue May 12 08:56:04 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, sponsororg = "IEEE Comput. Soc. Tech. Committee on Distributed Process", } @Proceedings{LakshmanYN:1996:IPI, editor = "{Lakshman Y. N.}", booktitle = "{ISSAC '96: Proceedings of the 1996 International Symposium on Symbolic and Algebraic Computation, July 24--26, 1996, Zurich, Switzerland}", title = "{ISSAC '96: Proceedings of the 1996 International Symposium on Symbolic and Algebraic Computation, July 24--26, 1996, Zurich, Switzerland}", publisher = pub-ACM, address = pub-ACM:adr, pages = "xvii + 313", year = "1996", ISBN = "0-89791-796-0", ISBN-13 = "978-0-89791-796-4", LCCN = "QA 76.95 I59 1996", bibdate = "Thu Mar 12 08:00:14 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/issac.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, sponsor = "ACM; Special Interest Group in Symbolic and Algebraic Manipulation (SIGSAM). ACM; Special Interest Group on Numerical Mathematics (SIGNUM).", } @Proceedings{Szymanski:1996:LCR, editor = "Boleslaw K. 
Szymanski and Balaram Sinharoy", booktitle = "Languages, Compilers and Run-Time Systems for Scalable Computers, Troy, NY, USA, May 22--24, 1995", title = "Languages, Compilers and Run-Time Systems for Scalable Computers, Troy, {NY}, {USA}, May 22--24, 1995", publisher = pub-KLUWER, address = pub-KLUWER:adr, pages = "xiv + 335", year = "1996", ISBN = "0-7923-9635-9", ISBN-13 = "978-0-7923-9635-2", LCCN = "QA76.58.L37 1996", bibdate = "Sat Sep 28 18:12:58 MDT 1996", bibsource = "https://www.math.utah.edu/pub/tex/bib/mach.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Proceedings{USENIX:1996:ATT, editor = "{USENIX} Association", booktitle = "4th Annual Tcl/Tk Workshop '96, July 10--13, 1996. Monterey, CA", title = "4th Annual Tcl/Tk Workshop '96, July 10--13, 1996. Monterey, {CA}", publisher = pub-USENIX, address = pub-USENIX:adr, pages = "????", day = "10--13", month = jul, year = "1996", ISBN = "1-880446-78-2", ISBN-13 = "978-1-880446-78-2", LCCN = "QA76.73.T44 T44 1996", bibdate = "Fri Oct 18 07:24:24 MDT 1996", bibsource = "ftp://ftp.uu.net/library/bibliography; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, location = "Monterey, CA", } @Proceedings{USENIX:1996:PFA, editor = "{USENIX}", booktitle = "Proceedings of the fourth annual Tcl\slash Tk Workshop, July 10--13, 1996, Monterey, California", title = "Proceedings of the fourth annual Tcl\slash Tk Workshop, July 10--13, 1996, Monterey, California", publisher = pub-USENIX, address = pub-USENIX:adr, pages = "235", year = "1996", ISBN = "1-880446-78-2", ISBN-13 = "978-1-880446-78-2", LCCN = "QA 76.73 T44 T35 1996", bibdate = "Mon May 11 11:50:25 1998", bibsource = "ftp://ftp.uu.net/library/bibliography; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.usenix.org/publications/library/proceedings/tcl96/", acknowledgement = ack-nhfb, location = "Monterey, CA", } @Proceedings{IEEE:1997:APD, editor = 
"{IEEE}", booktitle = "Advances in parallel and distributed computing: March 19--21, 1997, Shanghai, China: proceedings", title = "Advances in parallel and distributed computing: March 19--21, 1997, Shanghai, China: proceedings", publisher = pub-IEEE, address = pub-IEEE:adr, pages = "xii + 426", year = "1997", ISBN = "0-8186-7876-3 (paperback and case), 0-8186-7878-X (microfiche)", ISBN-13 = "978-0-8186-7876-9 (paperback and case), 978-0-8186-7878-3 (microfiche)", LCCN = "QA76.58 .A4 1997", bibdate = "Wed Apr 16 07:34:31 MDT 1997", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "electronic data processing -- distributed processing -- congresses; parallel processing (electronic computers) -- congresses", } @Proceedings{ACM:1998:AWJ, editor = "{ACM}", booktitle = "ACM 1998 Workshop on Java for High-Performance Network Computing", title = "{ACM} 1998 Workshop on Java for High-Performance Network Computing", publisher = pub-ACM, address = pub-ACM:adr, pages = "????", year = "1998", ISBN = "????", ISBN-13 = "????", LCCN = "????", bibdate = "Thu Apr 27 10:40:59 2000", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "Possibly unpublished, except electronically.", URL = "http://www.cs.ucsb.edu/conferences/java98/program.html", acknowledgement = ack-nhfb, } @Proceedings{ACM:1998:CRP, editor = "ACM", booktitle = "Conference record of POPL '98: the 25th ACM SIGPLAN-SIGACT Symposium on Principles of Programming Languages: papers presented at the Symposium, San Diego, California, 19--21 January 1998", title = "Conference record of {POPL} '98: the 25th {ACM} {SIGPLAN-SIGACT} Symposium on Principles of Programming Languages: papers presented at the Symposium, San Diego, California, 19--21 January 1998", publisher = pub-ACM, address = pub-ACM:adr, pages = "viii + 408", year = "1998", ISBN = "0-89791-979-3", ISBN-13 = "978-0-89791-979-1", LCCN = "QA76.7 .A15 1998", bibdate = "Mon May 3 
17:47:49 MDT 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "ACM order number: 549981.", URL = "http://www.acm.org/pubs/contents/proceedings/plan/268946/index.html", acknowledgement = ack-nhfb, alttitle = "POPL '98 ACM SIGPLAN-SIGACT Symposium on Principles of Programming Languages Principles of programming languages Proceedings 25th ACM SIGPLAN-SIGACT Symposium on Principles of Programming Languages", keywords = "Electronic digital computers -- Programming -- Congresses.; Programming languages (Electronic computers) -- Congresses.", } @Proceedings{ACM:1998:PAI, editor = "{ACM}", booktitle = "{Proceedings: the 25th Annual International Symposium on Computer Architecture, June 27--July 1, 1998, Barcelona, Spain}", title = "{Proceedings: the 25th Annual International Symposium on Computer Architecture, June 27--July 1, 1998, Barcelona, Spain}", volume = "26(3)", publisher = pub-ACM, address = pub-ACM:adr, pages = "xiii + 394", year = "1998", ISBN = "0-8186-8491-7, 0-8186-8492-5, 0-8186-8493-3", ISBN-13 = "978-0-8186-8491-3, 978-0-8186-8492-0, 978-0-8186-8493-7", LCCN = "QA76.9.A73 S97 1998", bibdate = "Fri May 12 12:36:10 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; z3950.bibsys.no:2100/BIBSYS", note = "ACM Order Number 414984. 
IEEE Computer Society Order Number PR08491; IEEE Order Plan Catalog Number 98CB36235.", series = "Computer architecture news", URL = "http://portal.acm.org/toc.cfm?id=279358; http://portal.acm.org/toc.cfm?id=285930", acknowledgement = ack-nhfb, remark = "ISCA '25 proceedings.", } @Proceedings{ACM:1998:SHP, editor = "{ACM}", booktitle = "SC'98: High Performance Networking and Computing: Proceedings of the 1998 ACM\slash IEEE SC98 Conference: Orange County Convention Center, Orlando, Florida, USA, November 7--13, 1998", title = "{SC}'98: High Performance Networking and Computing: Proceedings of the 1998 {ACM}\slash {IEEE} {SC98} Conference: Orange County Convention Center, Orlando, Florida, {USA}, November 7--13, 1998", publisher = pub-ACM # " and " # pub-IEEE, address = pub-ACM:adr # " and " # pub-IEEE:adr, pages = "????", year = "1998", ISBN = "????", ISBN-13 = "????", LCCN = "????", bibdate = "Wed Oct 07 08:51:34 1998", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.supercomp.org/sc98/papers/", acknowledgement = ack-nhfb, } @Proceedings{USENIX:1998:PSA, editor = "{USENIX}", booktitle = "Proceedings of the sixth annual Tcl/Tk Conference, September 18--24 [i.e. 14--18], 1998, San Diego, California", title = "Proceedings of the sixth annual Tcl/Tk Conference, September 18--24 [i.e. 
14--18], 1998, San Diego, California", publisher = pub-USENIX, address = pub-USENIX:adr, pages = "206", year = "1998", ISBN = "1-880446-98-7", ISBN-13 = "978-1-880446-98-0", LCCN = "QA76.73.T44 T34 1998", bibdate = "Fri Oct 18 08:12:11 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://db.usenix.org/publications/library/proceedings/tcl98/", acknowledgement = ack-nhfb, } @Proceedings{USENIX:1998:PUWa, editor = "{USENIX}", booktitle = "Proceedings of the 2nd {USENIX Windows NT} Symposium: August 3--5, 1998, Seattle, Washington", title = "Proceedings of the 2nd {USENIX Windows NT} Symposium: August 3--5, 1998, Seattle, Washington", publisher = pub-USENIX, address = pub-USENIX:adr, pages = "173", year = "1998", ISBN = "1-880446-95-2", ISBN-13 = "978-1-880446-95-9", LCCN = "QA76.76.O63 U885 1998", bibdate = "Fri Oct 29 08:40:21 1999", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://db.usenix.org/publications/library/proceedings/usenix-nt98", acknowledgement = ack-nhfb, } @Proceedings{ACM:1999:PASa, editor = "ACM", booktitle = "Proceedings of the ACM SIGPLAN '99 Conference on Programming Language Design and Implementation (PLDI '99), Atlanta, Georgia, 2--4 May 1999", title = "Proceedings of the {ACM} {SIGPLAN} '99 Conference on Programming Language Design and Implementation ({PLDI} '99), Atlanta, Georgia, 2--4 May 1999", publisher = pub-ACM, address = pub-ACM:adr, pages = "????", year = "1999", ISBN = "????", ISBN-13 = "????", LCCN = "????", bibdate = "Thu May 13 14:45:29 1999", bibsource = "http://www.acm.org/pubs/contents/proceedings/pldi/301122/index.html; http://www.acm.org/pubs/contents/proceedings/pldi/301618/index.html; http://www.cs.rutgers.edu/pldi99/program.html; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Proceedings{ACM:1999:SPO, editor = "{ACM}", booktitle = "SC'99: Oregon Convention Center 777 NE Martin Luther King Jr. 
Boulevard, Portland, Oregon, November 11--18, 1999", title = "{SC}'99: Oregon Convention Center 777 {NE} Martin Luther King Jr. Boulevard, Portland, Oregon, November 11--18, 1999", publisher = pub-ACM # " and " # pub-IEEE, address = pub-ACM:adr # " and " # pub-IEEE:adr, pages = "????", year = "1999", ISBN = "????", ISBN-13 = "????", LCCN = "????", bibdate = "Thu Feb 24 09:35:00 2000", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Proceedings{Atkinson:1999:PTF, editor = "Malcolm P. Atkinson and Maria E. Orlowska and Patrick Valduriez and Stanley B. Zdonik and Michael L. Brodie", booktitle = "Proceedings of the Twenty-fifth International Conference on Very Large Databases, Edinburgh, Scotland, UK, 7--10 September, 1999", title = "Proceedings of the Twenty-fifth International Conference on Very Large Databases, Edinburgh, Scotland, {UK}, 7--10 September, 1999", publisher = pub-MORGAN-KAUFMANN, address = pub-MORGAN-KAUFMANN:adr, pages = "xviii + 761", year = "1999", ISBN = "1-55860-615-7", ISBN-13 = "978-1-55860-615-9", LCCN = "QA76.9.D3 I559 1999", bibdate = "Tue Oct 24 18:36:50 MDT 2000", bibsource = "DBLP; http://dblp.uni-trier.de; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/vldb.bib; OCLC Proceedings database", note = "Also known as VLDB'99", acknowledgement = ack-nhfb, keywords = "very large data bases; VLDB", } @Proceedings{Dongarra:1999:RAP, editor = "J. J. Dongarra and E.
Luque and Tomas Margalef",
  booktitle =    "{Recent advances in parallel virtual machine and message passing interface: 6th European PVM\slash {MPI} Users' Group Meeting, Barcelona, Spain, September 26--29, 1999: Proceedings}",
  title =        "{Recent advances in parallel virtual machine and message passing interface: 6th European PVM\slash {MPI} Users' Group Meeting, Barcelona, Spain, September 26--29, 1999: Proceedings}",
  volume =       "1697",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "xvii + 551",
  year =         "1999",
  CODEN =        "LNCSD9",
  DOI =          "????",
  ISBN =         "3-540-66549-8 (softcover)",
  ISBN-13 =      "978-3-540-66549-6 (softcover)",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  LCCN =         "QA76.58 E973 1999",
  bibdate =      "Wed Dec 8 06:34:56 MST 1999",
  bibsource =    "https://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       ser-LNCS,
  URL =          "http://link.springer-ny.com/link/service/series/0558/tocs/t1697.htm; http://www.springerlink.com/openurl.asp?genre=issue&issn=0302-9743&volume=1697",
  acknowledgement = ack-nhfb,
  alttitle =     "PVM\slash MPI '99",
  keywords =     "Data transmission systems; Parallel computers; Virtual computer systems",
}

@Proceedings{IEEE:1999:HCS,
  editor =       "{IEEE}",
  booktitle =    "Hot Chips 11: Stanford University, Stanford, California, August 15--17, 1999",
  title =        "Hot Chips 11: Stanford University, Stanford, California, August 15--17, 1999",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "????",
  year =         "1999",
  ISBN =         "????",
  ISBN-13 =      "????",
  LCCN =         "????",
  bibdate =      "Mon Jan 08 05:26:43 2001",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/hot-chips.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.hotchips.org/hotc11_index.html",
  acknowledgement = ack-nhfb,
}

@Proceedings{ACM:2000:SHP,
  editor =       "{ACM}",
  booktitle =    "SC2000: High Performance Networking and Computing.
Dallas Convention Center, Dallas, TX, USA, November 4--10, 2000", title = "{SC2000}: High Performance Networking and Computing. Dallas Convention Center, Dallas, {TX}, {USA}, November 4--10, 2000", publisher = pub-ACM # " and " # pub-IEEE, address = pub-ACM:adr # " and " # pub-IEEE:adr, pages = "????", year = "2000", ISBN = "", ISBN-13 = "", LCCN = "", bibdate = "Thu Feb 24 09:35:00 2000", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.sc2000.org/proceedings/info/fp.pdf", acknowledgement = ack-nhfb, } @Proceedings{Anonymous:2000:CCI, editor = "Anonymous", booktitle = "Cool Chips III: An International Symposium on Low-Power and High-Speed Chips, Kikai-Shinko-Kaikan, Tokyo, Japan April 24--25, 2000", title = "Cool Chips {III}: An International Symposium on Low-Power and High-Speed Chips, Kikai-Shinko-Kaikan, Tokyo, Japan April 24--25, 2000", publisher = "????", address = "????", pages = "????", year = "2000", ISBN = "", ISBN-13 = "", LCCN = "", bibdate = "Mon Jan 08 09:19:21 2001", bibsource = "http://www.coolchips.org/index-cool3.html; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Book{Koniges:2000:ISP, editor = "Alice E. 
Koniges", booktitle = "Industrial Strength Parallel Computing", title = "Industrial Strength Parallel Computing", publisher = pub-MORGAN-KAUFMANN, address = pub-MORGAN-KAUFMANN:adr, pages = "xxv + 597", year = "2000", ISBN = "1-55860-540-1", ISBN-13 = "978-1-55860-540-4", LCCN = "QA76.58 .I483 2000", bibdate = "Fri Feb 04 18:30:40 2000", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Proceedings{USENIX:2000:PUT, editor = "{USENIX}", booktitle = "Proceedings of the 7th USENIX Tcl\slash Tk Conference (Tcl/2k): February 14--18, 2000, Austin, Texas, USA", title = "Proceedings of the 7th {USENIX} Tcl\slash Tk Conference (Tcl/2k): February 14--18, 2000, Austin, Texas, {USA}", publisher = pub-USENIX, address = pub-USENIX:adr, pages = "194", year = "2000", ISBN = "1-880446-24-3", ISBN-13 = "978-1-880446-24-9", LCCN = "????", bibdate = "Wed Oct 16 09:54:12 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/usenix2000.bib", URL = "http://db.usenix.org/publications/library/proceedings/tcl2k/", acknowledgement = ack-nhfb, } @Proceedings{USENIX:2000:UAT, editor = "{USENIX}", booktitle = "2000 USENIX Annual Technical Conference: San Diego, CA, USA, June 18--23, 2000", title = "2000 {USENIX} Annual Technical Conference: San Diego, {CA}, {USA}, June 18--23, 2000", publisher = pub-USENIX, address = pub-USENIX:adr, pages = "350", year = "2000", ISBN = "1-880446-22-7", ISBN-13 = "978-1-880446-22-5", LCCN = "????", bibdate = "Mon Oct 14 07:43:52 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/usenix2000.bib", URL = "http://www.usenix.org/publications/library/proceedings/usenix2000", acknowledgement = ack-nhfb, } @Proceedings{ACM:2001:PAJ, editor = "{ACM}", booktitle = "Proceedings of the {ACM 2001 Java Grande\slash ISCOPE Conference: Palo Alto, Calif., June 2--4, 2001}", title = "Proceedings of the 
{ACM 2001 Java Grande\slash ISCOPE Conference: Palo Alto, Calif., June 2--4, 2001}", publisher = pub-ACM, address = pub-ACM:adr, pages = "vi + 186", year = "2001", ISBN = "1-58113-359-6", ISBN-13 = "978-1-58113-359-2", LCCN = "QA76.9.O35 A26 2001", bibdate = "Mon May 6 06:26:30 MDT 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Java (computer program language) -- congresses; object-oriented methods (computer science) -- congresses", } @Proceedings{Boisvert:2001:ASS, editor = "Ronald F. Boisvert and Ping Tak Peter Tang", booktitle = "The architecture of scientific software: {IFIP TC2/WG2.5 Working Conference on the Architecture of Scientific Software, October 2--4, 2000, Ottawa, Canada}", title = "The architecture of scientific software: {IFIP TC2/WG2.5 Working Conference on the Architecture of Scientific Software, October 2--4, 2000, Ottawa, Canada}", volume = "60", publisher = pub-KLUWER, address = pub-KLUWER:adr, pages = "xx + 358", year = "2001", ISBN = "0-7923-7339-1", ISBN-13 = "978-0-7923-7339-1", LCCN = "QA76.758 .I345 2000", bibdate = "Fri May 27 08:46:38 2005", bibsource = "https://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", series = "IFIP", acknowledgement = ack-nhfb, tableofcontents = "Preface (p. ix)\\ Contributing Authors (p. xv)\\ Part I: Large-Scale Systems Integration\\ Network-Based Scientific Computing: Elias N. Houstis, Ann Christine Catlin, Ganesh Balakrishnan, Nitesh Dhanjani, GaHyun Park, John R. Rice, Spyros Lalis, Manolis Stamatogiannakis, Catherine E. Houstis (pp. 3--28) \\ Future Generations of Problem-Solving Environments: Jos{\'e} C. Cunha (pp. 29--38) \\ Developing an Architecture to Support the Implementation and Development of Scientific computing Applications: Dorian C. Arnold, Jack Dongarra (pp. 
39--56) \\ PETSc and Overture: Lessons Learned Developing an Interface between Components: Kristopher R. Buschelman, William Gropp, Lois C. McInnes, Barry F. Smith (pp. 57--68) \\ Component Technology for High-Performance Scientific Simulation Software: Tom Epperly, Scott R. Kohn, Gary Kumfert (pp. 69--86) \\ A New Approach to Software Integration Frameworks for Multi-physics Simulation Codes: Eric de Sturler, Jay Hoeflinger, Laxmikant V. Kal{\'e}, Milind Bhandarkar (pp. 87--104) \\ Code Coupling using Parallel CORBA Objects: Christophe Ren{\'e}, Thierry Priol, Guillaume All{\'e}on (pp. 105--118) \\ A Collaborative Code Development Environment for Computational Electro-magnetics: Matthew S. Shields, Omer F. Rana, David W. Walker, David Colby (pp. 119--144) \\ Part II: The Architecture of Components\\ On the Role of Mathematical Abstractions for Scientific Computing: Krister {\AA}hlander, Magne Haveraaen, Hans Z. Munthe-Kaas (pp. 145--158) \\ Object-oriented Modeling of Parallel PDE Solvers: Michael Thun{\'e}, Krister {\AA}hlander, Malin Ljungberg, Markus Nord{\'e}n, Kurt Otto, Jarmo Rantakokko (pp. 159--174) \\ Broadway: A Software Architecture for Scientific Computing: Samuel Z. Guyer, Calvin Lin (pp. 175--192) \\ Formal Methods for High-Performance Linear Algebra Libraries: John A. Gunnels, Robert A. van de Geijn (pp. 193--210) \\ New Generalized Matrix Data Structures Lead to a Variety of High-Performance Algorithms: Fred G. Gustavson (pp. 211--234) \\ A Comprehensive DFT API for Scientific Computing: Ping Tak Peter Tang (pp. 235--256) \\ Using A Fortran Interface to POSIX Threads: Richard J. Hanson, Clay P. Breshears, Henry A. Gabb (pp. 257--272) \\ Data Management Systems for Scientific Applications: Reagan Moore (pp. 273--284) \\ Software Components for Application Development: Arnaud Desitter, Antoine Le Hyaric, Geoff Morgan, Gareth Shaw, Anne E. Trefethen (pp. 
285--300) \\ Hierarchical Representation and Computation of Approximate Solutions in Scientific Simulations: Wayne H. Enright (pp. 301--316) \\ Software Architecture for the Investigation of Controllable Models with Complex Data Sets: Dmitry Belyshev, Vladimir I. Gurman (pp. 317--332) \\ A Mixed-Language Programming Methodology for High Performance Java Computing: Vladimir Getov (pp. 333--350) \\ Part III: Conference Information\\ The Architecture of Scientific Software: the Conference (pp. 351--356)\\ Index (pp. 357--358)",
}

@Proceedings{Eigenmann:2001:OSM,
  editor =       "Rudolf Eigenmann and Michael J. Voss",
  booktitle =    "{OpenMP} shared memory parallel programming: International Workshop on {OpenMP} Applications and Tools, {WOMPAT} 2001, West Lafayette, {IN}, {USA}, July 30--31, 2001: proceedings",
  title =        "{OpenMP} shared memory parallel programming: International Workshop on {OpenMP} Applications and Tools, {WOMPAT} 2001, West Lafayette, {IN}, {USA}, July 30--31, 2001: proceedings",
  volume =       "2104",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "x + 184",
  year =         "2001",
  ISBN =         "3-540-42346-X (paperback)",
  ISBN-13 =      "978-3-540-42346-1 (paperback)",
  LCCN =         "QA76.642 .I589 2001; QA267.A1 L43 no.2104",
  bibdate =      "Thu Jan 17 11:49:19 MST 2002",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       ser-LNCS,
  URL =          "http://link.springer-ny.com/link/service/series/0558/tocs/t2104.htm",
  acknowledgement = ack-nhfb,
  keywords =     "parallel programming (computer science) -- congresses",
}

@Proceedings{USENIX:2001:PJV,
  editor =       "{USENIX}",
  booktitle =    "Proceedings of the Java Virtual Machine Research and Technology Sy[m]posium (JVM '01): April 23--24, 2001, Monterey, California, USA. Berkeley, CA",
  title =        "Proceedings of the Java Virtual Machine Research and Technology Sy[m]posium ({JVM} '01): April 23--24, 2001, Monterey, California, {USA}.
Berkeley, {CA}", publisher = pub-USENIX, address = pub-USENIX:adr, pages = "232", year = "2001", ISBN = "1-880446-11-1", ISBN-13 = "978-1-880446-11-9", LCCN = "QA76.73.J38 J42 2001", bibdate = "Tue Oct 15 12:35:06 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/usenix2000.bib", URL = "http://www.usenix.org/publications/library/proceedings/jvm01/", acknowledgement = ack-nhfb, } @Proceedings{IEEE:2002:STI, editor = "{IEEE}", booktitle = "{SC2002}: From Terabytes to Insight. Proceedings of the {IEEE ACM SC 2002 Conference, November 16--22, 2002, Baltimore, MD, USA}", title = "{SC2002}: From Terabytes to Insight. Proceedings of the {IEEE ACM SC 2002 Conference, November 16--22, 2002, Baltimore, MD, USA}", publisher = pub-IEEE, address = pub-IEEE:adr, pages = "????", year = "2002", ISBN = "0-7695-1524-X", ISBN-13 = "978-0-7695-1524-3", LCCN = "????", bibdate = "Thu Feb 21 18:29:36 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Proceedings{USENIX:2002:PBF, editor = "{USENIX}", booktitle = "Proceedings of BSDCon 2002: February 11--14, 2002, Cathedral Hill Hotel, San Francisco, CA", title = "Proceedings of {BSDCon} 2002: February 11--14, 2002, Cathedral Hill Hotel, San Francisco, {CA}", publisher = pub-USENIX, address = pub-USENIX:adr, pages = "viii + 151", year = "2002", ISBN = "1-880446-02-2", ISBN-13 = "978-1-880446-02-7", LCCN = "QA76.76.O63 B736 2002", bibdate = "Tue Oct 15 12:45:29 2002", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/usenix2000.bib", URL = "http://www.usenix.org/publications/library/proceedings/bsdcon02/tech.html", acknowledgement = ack-nhfb, } @Proceedings{ACM:2003:ATA, editor = "Allyn Romanow and Jeff Mogul", booktitle = "{Proceedings of the ACM SIGCOMM Workshop on Network-I/O Convergence: experience, Lessons, Implications 2003, Karlsruhe, Germany, August 
25--27, 2003}",
  title =        "{Proceedings of the ACM SIGCOMM Workshop on Network-I/O Convergence: experience, Lessons, Implications 2003, Karlsruhe, Germany, August 25--27, 2003}",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "????",
  year =         "2003",
  ISBN =         "????",
  ISBN-13 =      "????",
  LCCN =         "TK5105.5",
  bibdate =      "Sat Oct 14 14:04:48 2006",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  note =         "ACM order number 534032.",
  acknowledgement = ack-nhfb,
}

@Proceedings{ACM:2003:SII,
  editor =       "{ACM}",
  booktitle =    "SC2003: Igniting Innovation. {Phoenix, AZ, November 15--21, 2003}",
  title =        "{SC2003}: Igniting Innovation. {Phoenix, AZ, November 15--21, 2003}",
  publisher =    pub-ACM # " and " # pub-IEEE,
  address =      pub-ACM:adr # " and " # pub-IEEE:adr,
  pages =        "????",
  year =         "2003",
  ISBN =         "1-58113-695-1",
  ISBN-13 =      "978-1-58113-695-1",
  LCCN =         "????",
  bibdate =      "Thu Feb 21 18:29:36 2003",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{Anonymous:2003:CCV,
  editor =       "Anonymous",
  booktitle =    "Cool Chips VI: An International Symposium on Low-Power and High-Speed Chips, Yokohama Joho Bunka Center, Yokohama, Japan (Yokohama Media \& Communications Center, Yokohama, Japan) April 16--18, 2003",
  title =        "Cool Chips {VI}: An International Symposium on Low-Power and High-Speed Chips, Yokohama Joho Bunka Center, Yokohama, Japan (Yokohama Media \& Communications Center, Yokohama, Japan) April 16--18, 2003",
  publisher =    "????",
  address =      "????",
  pages =        "????",
  year =         "2003",
  ISBN =         "????",
  ISBN-13 =      "????",
  LCCN =         "????",
  bibdate =      "Fri Jan 09 16:53:37 2004",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cool-chips.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{Chapman:2005:SMP,
  editor =       "Barbara M.
Chapman", booktitle = "{Shared memory parallel programming with OpenMP: 5th International Workshop on OpenMP Applications and Tools, WOMPAT 2004, Houston, TX, USA, May 17--18, 2004: Revised selected papers}", title = "{Shared memory parallel programming with OpenMP: 5th International Workshop on OpenMP Applications and Tools, WOMPAT 2004, Houston, TX, USA, May 17--18, 2004: Revised selected papers}", volume = "3349", publisher = pub-SV, address = pub-SV:adr, pages = "x + 147", year = "2005", CODEN = "LNCSD9", DOI = "https://doi.org/10.1007/b105895", ISBN = "3-540-24560-X", ISBN-13 = "978-3-540-24560-5", ISSN = "0302-9743 (print), 1611-3349 (electronic)", LCCN = "QA76 .A1 L42 NO.3349", bibdate = "Thu Jun 2 07:26:02 MDT 2005", bibsource = "clavis.ucalgary.ca:2200/UNICORN; https://www.math.utah.edu/pub/tex/bib/multithreading.bib", series = ser-LNCS, URL = "http://www.springerlink.com/openurl.asp?genre=issue&issn=0302-9743&volume=3349; http://www.springerlink.com/openurl.asp?genre=volume&id=doi:10.1007/b105895", acknowledgement = ack-nhfb, meetingname = "International Workshop on OpenMP Applications and Tools (2004: Houston, Tex.)", subject = "Parallel programming (Computer science); Congresses", } @Proceedings{Lathrop:2011:SPI, editor = "Scott Lathrop and Jim Costa and William Kramer", booktitle = "{SC'11: Proceedings of 2011 International Conference for High Performance Computing, Networking, Storage and Analysis, Seattle, WA, November 12--18 2011}", title = "{SC'11: Proceedings of 2011 International Conference for High Performance Computing, Networking, Storage and Analysis, Seattle, WA, November 12--18 2011}", publisher = pub-ACM # " and " # pub-IEEE, address = pub-ACM:adr # " and " # pub-IEEE:adr, pages = "????", year = "2011", ISBN = "1-4503-0771-X", ISBN-13 = "978-1-4503-0771-0", LCCN = "????", bibdate = "Fri Dec 16 11:11:35 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; 
https://www.math.utah.edu/pub/tex/bib/supercomputing2011.bib", acknowledgement = ack-nhfb, xxeditor = "{ACM}", } @Proceedings{Hollingsworth:2012:SPI, editor = "Jeffrey Hollingsworth", booktitle = "{SC '12: Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis, Salt Lake Convention Center, Salt Lake City, UT, USA, November 10--16, 2012}", title = "{SC '12: Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis, Salt Lake Convention Center, Salt Lake City, UT, USA, November 10--16, 2012}", publisher = pub-IEEE, address = pub-IEEE:adr, year = "2012", ISBN = "1-4673-0804-8", ISBN-13 = "978-1-4673-0804-5", bibdate = "Thu Nov 15 07:35:55 2012", bibsource = "https://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib; https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", acknowledgement = ack-nhfb, }